# Import packages
import numpy as np
import math
# Data Visualization
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.pyplot import figure
from matplotlib import style
from matplotlib import ticker
from wordcloud import WordCloud, STOPWORDS
# Data Manipulation
import pandas as pd
from datetime import date, time, datetime
import re
# Packages for SQL
import psycopg2
import sqlalchemy as sa
from sqlalchemy.engine import URL
from sqlalchemy import text
# Packages for Statistical Analysis
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col
# Import datasets
carfeatures = pd.read_csv("data/features.csv")
results_raw = pd.read_csv("data_raw/results.csv")
results = pd.read_csv("data_raw/results.csv")
races_raw = pd.read_csv("data_raw/races.csv")
races = pd.read_csv("data_raw/races.csv")
circuits_raw = pd.read_csv("data_raw/circuits.csv")
circuits = pd.read_csv("data_raw/circuits.csv")
financial = pd.read_csv("data_raw/financial.csv")
bills_actions = pd.read_csv("data_raw/bills_actions.csv")
portfolios = pd.read_csv("data_raw/portfolios.csv")
cars = pd.read_csv("data_raw/cars.csv")
portfolios["date"] = pd.to_datetime(portfolios["date_str"])
# Global Codes
np.random.seed(5120)
We use the function type() to identify the type of object: integers (int), floats, or strings (str).
type(3)
int
type(3.5)
float
type("hello")
str
print(3 * 2)
print(3 + 2)
print(3 - 2)
print(3 / 2)
print(3 ** 2) # Exponentiation
6 5 1 1.5 9
We can use () for composite operations
(3 + 4) / 5
1.4
We can assign values to variables and apply operators on variables.
number3 = 3
number3andhalf = 3.5
message_hello = "hello"
print(number3 * 2)
print(number3andhalf ** 2)
print( (number3 + 4) / 5 )
6 12.25 1.4
Concatenate ("add") two strings
name = "Jiuru"
print("My name is " + name)
My name is Jiuru
We use [...] to denote lists. Elements in lists are separated by commas.
list_numbers = [1, 2, 3, 4, 5]
list_numbers_sqr = [1, 4, 9, 16, 25]
The type of elements in a list can be anything.
list_colors = ["red", "yellow", "yellow", "green", "red"]
list_mixedtype = ["red", 1, "yellow", 4, 5]
To extract individual elements from a list, we need to understand python lists start at the zero position. In other words, the first element in the list has an index of 0. We use [index] to extract elements from a list
floors_england = ["ground", "floor1", "floor2" ]
print(floors_england[0])
print(floors_england[1])
print(floors_england[2])
ground floor1 floor2
# Recall the list_colors we defined before
print(list_colors[0])
print(list_colors[1])
print(list_colors[2])
print(list_colors[3])
print(list_colors[4])
red yellow yellow green red
We can create lists with null values.
list_answers = [None,None,None]
print(list_answers)
[None, None, None]
Now, we can assign or replace values in lists
list_answers[0] = "Atlanta"
list_answers[1] = "Chicago"
list_answers[2] = "Boston"
print(list_answers)
['Atlanta', 'Chicago', 'Boston']
We can use [] to create an empty list and use list.append() to append values to lists.
new_list = []
new_list.append("Tuesday")
new_list.append("Wednesday")
new_list.append("Thursday")
new_list.append("Friday")
new_list.append("Saturday")
print(new_list)
['Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
We can create lists with repeated values by multiplying a list by an integer
# Repeat a single value 30 times
list_two_rep = [7] * 30
# Repeat a list 4 times
list_answers_rep = list_answers * 4
# Repeat of 8 null values
list_none_rep = [None] * 8
print(list_two_rep)
print(list_answers_rep)
print(list_none_rep)
[7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7] ['Atlanta', 'Chicago', 'Boston', 'Atlanta', 'Chicago', 'Boston', 'Atlanta', 'Chicago', 'Boston', 'Atlanta', 'Chicago', 'Boston'] [None, None, None, None, None, None, None, None]
We can use the function len() to count the length of a list
print(len(list_answers))
print(len(list_two_rep))
print(len(list_answers_rep))
3 30 12
*Exercise*
list_personal = []
list_personal.append("Hello")
list_personal.append("World")
print(len(list_personal))
# There are two ways to change the last value of a list:
list_personal[-1] = "Last element"
list_personal[len(list_personal)-1] = "Last element"
list_personal
2
['Hello', 'Last element']
*Note*: list[-1] calls the last element in the list.
We can use list(range(n)) to create a list from 0 to n-1.
n = 10
list_zero_ten = list(range(n))
print(list_zero_ten)
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
We can use == to test the equality of two strings.
"Is this the real life?" == "is this just fantasy?"
False
# Another example
any_questions = "no"
print(any_questions == "yes" )
False
To test for the presence of keywords in a sentence, we use in. The syntax is
keyword in sentence
keyword = "alejandro"
sentence = "The Federal Reserve makes forecasts about many economic outcomes"
keyword in sentence
False
We can also use in to test whether an element is a part of a list.
current_month = "July"
list_summer_months = ["June","July","August"]
print(current_month in list_summer_months)
True
*Exercise*
Write code to check whether the string "red" is contained in the list
["red","green","yellow","orange"]
color = "red"
color_list = ["red", "green", "yellow", "orange"]
print(color in color_list)
True
When testing with numbers, we have:
x = 4
print(x < 5)
print(x <= 5)
print(x == 5)
print(x >= 5)
print(x > 5)
True True False False False
If we want to validate a data type, we can use the isinstance() function. The syntax is
isinstance(x, type)
y = "Monday"
print(isinstance(y, int))
print(isinstance(y, float))
print(isinstance(y, str))
False False True
Equality of vectors is done element-by-element
vec_a = np.array([1,2,3])
vec_b = np.array([1,2,4])
vec_a == vec_b
array([ True, True, False])
When testing multiple expressions, we have not, and (&), or (|).
age = 10
# Can this person legally vote in the US?
print(age >= 18)
print(not (age < 18))
False False
age = 31
# Is this age between 20 and 30 (including these ages)?
( age >= 20 ) & (age <= 30)
False
student_status = "freshman"
# Is the student in the first two years of undergrad?
(student_status == "freshman") | (student_status == "sophomore")
True
*Exercise*
Write code that checks the following conditions:
age = 28
print((age < 20) & (age > 30))  # parentheses required: & binds tighter than < and >
print(not((age >= 25) & (age <= 27)))
False True
We can average a list of booleans with np.mean() (True counts as 1, False as 0).
list_boolean = [True,False,True,False,False]
np.mean(list_boolean)
0.4
if/elif/else)¶The following is the syntax for flow control:
if condition:
body_statement_1
elif condition:
body_statement_2
elif condition:
body_statement_3
...
else:
body_statement_4
any_questions = "yes"
if any_questions == "yes":
print("Need to give more explanations")
Need to give more explanations
is_graph_red = False
how_many_classes = np.array([7,1,2,3,3,3,4,5,6])
if is_graph_red:
plt.hist(x = how_many_classes, color="red")
plt.title("Count of students in each category")
plt.xlabel("How many classes are you taking?")
plt.show()
else:
plt.hist(x = how_many_classes, color="purple")
plt.title("Count of students in each category")
plt.xlabel("How many classes are you taking?")
plt.show()
years_in_program = 4
if years_in_program == 1:
print("This student is a freshman")
elif years_in_program == 2:
print("This student is a sophomore")
elif years_in_program == 3:
print("This student is a junior")
else:
print("This student is a senior")
This student is a senior
*Exercise*
" The sum is greater than or equal to 5"
c = np.array([1, 2, 3])
if sum(c) >= 5:
print("The sum is greater than or equal to 5")
else:
print("It is strictly less than 5")
The sum is greater than or equal to 5
for Loops¶This is the basic syntax for a for loop:
for value in list_values:
statement_body
list_ids = ["KIA", "Ferrari", "Ford", "Tesla"]
for id in list_ids:
print(f"Dear customer, we are writing about your {id} car.")
Dear customer, we are writing about your KIA car. Dear customer, we are writing about your Ferrari car. Dear customer, we are writing about your Ford car. Dear customer, we are writing about your Tesla car.
# Customized Message + Numbering
list_ids = ["KIA", "Ferrari", "Ford", "Tesla"]
index = 1
for id in list_ids:
print(f"Dear customer, your position is {str(index)} on the waitlist and your car brand is {id}")
index += 1
Dear customer, your position is 1 on the waitlist and your car brand is KIA Dear customer, your position is 2 on the waitlist and your car brand is Ferrari Dear customer, your position is 3 on the waitlist and your car brand is Ford Dear customer, your position is 4 on the waitlist and your car brand is Tesla
# Plot Multiple Variables
carfeatures = pd.read_csv("data/features.csv")
list_vars = ["acceleration", "weight", "displacement"]
for variable_name in list_vars:
plt.scatter(x= carfeatures[variable_name], y = carfeatures["mpg"])
plt.ylabel("mpg")
plt.xlabel(variable_name)
plt.show()
# Plots for Multiple Variables + Numbering
carfeatures = pd.read_csv("data/features.csv")
list_vars = ["acceleration","weight"]
index = 1
for variable_name in list_vars:
plt.scatter(x= carfeatures[variable_name], y = carfeatures["mpg"])
plt.ylabel("mpg")
plt.xlabel(variable_name)
plt.title(f"Figure {str(index)} The relationship between mpg and {str(variable_name)}")
plt.show()
index += 1
# Math Operations (Appending)
n = 50
list_x = [1,2,4,5,6,7,8,9,10]
list_y = []
# Create an index
index = 0
for x in list_x:
y = list_x[index]**2 + 2*x
list_y.append(y)
index = index + 1
print(list_y)
plt.scatter(list_x, list_y)
plt.xlabel("X-axis")
plt.ylabel("Y-axis")
[3, 8, 24, 35, 48, 63, 80, 99, 120]
Text(0, 0.5, 'Y-axis')
# Or we can use NumPy array to complete the task
list_x = np.arange(50)
list_y = list_x**2 + 2*list_x
plt.scatter(list_x, list_y)
plt.xlabel("X-axis")
plt.ylabel("Y-axis")
Text(0, 0.5, 'Y-axis')
# Math Operations + Numbering (Filling)
# Create a list of x-values list_x = [1,2,3, ..., 49]
# Create a list of y-values to fill in later.
n = 50
list_x = list(range(1,n,1))
list_y = [None] * len(list_x)
# Create an index
index = 0
for x in list_x:
list_y[index] = list_x[index]**2
index += 1
print(list_y)
plt.scatter(list_x, list_y)
plt.xlabel("X-axis")
plt.ylabel("Y-axis")
[1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225, 256, 289, 324, 361, 400, 441, 484, 529, 576, 625, 676, 729, 784, 841, 900, 961, 1024, 1089, 1156, 1225, 1296, 1369, 1444, 1521, 1600, 1681, 1764, 1849, 1936, 2025, 2116, 2209, 2304, 2401]
Text(0, 0.5, 'Y-axis')
*Exercise*
list_variables = ["weight", "acceleration","mpg"]
list_variables = ["weight", "acceleration", "mpg"]
for variable in list_variables:
plt.hist(x = carfeatures[variable])
plt.ylabel("Frequency")
plt.xlabel(str(variable))
plt.show()
*Exercise*
list_datasets = ["features.csv","worldbank_wdi_2019.csv"]
list_datasets = ["features.csv", "worldbank_wdi_2019.csv"]
for dataset in list_datasets:
df = pd.read_csv(f"data/{dataset}")
print(df.describe())
mpg cylinders displacement weight acceleration
count 398.000000 398.000000 398.000000 398.000000 398.000000
mean 23.514573 5.454774 193.427136 2970.424623 15.568090
std 7.815984 1.701004 104.268683 846.841774 2.757689
min 9.000000 3.000000 68.000000 1613.000000 8.000000
25% 17.500000 4.000000 104.250000 2223.750000 13.825000
50% 23.000000 4.000000 148.500000 2803.500000 15.500000
75% 29.000000 8.000000 262.000000 3608.000000 17.175000
max 46.600000 8.000000 455.000000 5140.000000 24.800000
life_expectancy gdp_per_capita_usd
count 252.000000 255.000000
mean 72.682931 17230.949757
std 7.382636 25792.183785
min 52.910000 216.972968
25% 67.109750 2186.046581
50% 73.599000 6837.717826
75% 78.234892 19809.323135
max 85.078049 199377.481800
for Loops¶List Comprehension
list_name = [ expression for value in list_values]
id_list = ["KIA", "Ferrari", "Ford", "Tesla"]
message_list = ["Your car model is :" + id for id in id_list]
print(message_list)
['Your car model is :KIA', 'Your car model is :Ferrari', 'Your car model is :Ford', 'Your car model is :Tesla']
# Customized Message + Numbering
topic_list = ["Python", "Python","SQL"]
module_list = ["One", "Two", "Three"]
num_topics = len(topic_list)
message_list = ["Module " + module_list[i] + " will cover " + topic_list[i] for i in range(num_topics)]
print(message_list)
['Module One will cover Python', 'Module Two will cover Python', 'Module Three will cover SQL']
# Math Operations
x_list = [ 1,2,3,4,5,6,7 ]
x_sqr_list = [ x**2 for x in x_list ]
print(x_sqr_list)
[1, 4, 9, 16, 25, 36, 49]
We can skip iterations by using continue. This feature is usually combined with if/else statements.
list_mixed = [1,2,"text_message",5]
for value in list_mixed:
if(not isinstance(value,int)):
continue
print(value)
1 2 5
Similarly, we can stop the loop by using break.
list_mixed = [1,2,"text_message",5]
for value in list_mixed:
if(not isinstance(value,int)):
print("Stopped: There is an element in your list that isn't an integer")
break
print(value)
1 2 Stopped: There is an element in your list that isn't an integer
while Loops¶The basic syntax of while loops is given below:
while test_expression:
body_statement
*Example*: More accuracy
$$f(x) = \frac{1}{x}$$
Similar logic applies to any algorithm that needs more iterations for accuracy.
e = 0.01
x_list = []
y_list = []
print(x_list)
x = 1
while 1/x > e:
x_list.append(x)
y_list.append(1/x)
x = x + 1
plt.scatter(x= x_list, y = y_list)
plt.title("Graph of f(x) = 1/x")
plt.xlabel("x")
plt.ylabel("y")
[]
Text(0, 0.5, 'y')
*Example*: Finding candidates with a cutoff
# This code loops over the list of players
# and stops once 2 tennis players are found
list_sports = ["tennis","golf","golf","tennis","golf","golf"]
candidate_list = []
num_candidates = 0
index = 0
while num_candidates < 2:
print(index)
if(list_sports[index] == "tennis"):
candidate_list.append(index)
num_candidates = num_candidates + 1
index = index + 1
print(candidate_list)
0 1 2 3 [0, 3]
A function is ...
"Built-in" functions are those from Python libraries, e.g.
print(), type(), round(),abs(), len()
# Argument: "Hello"
# Return: Showing the message on screen
print("Hello")
Hello
# Argument: 3.14
# Return: The type of object, e.g. int, str, boolean, float, etc.
type(3.14)
float
# First Argument: np.pi (a numeric value)
# Second Argument: 6 (the number of decimals)
# Return: Round the first argument, given the number of decimals in the second argument
round(np.pi, 6)
3.141593
# Argument: -4
# Return: The absolute value
abs(-4)
4
list_fruits = ["Apple","Orange","Pear"]
# Argument: list_fruits
# Return: The number of elements in the list
len(list_fruits)
3
Enter arguments by assigning parameters
vec_x = np.random.chisquare(df = 2, size = 20)
vec_y = np.random.normal(loc = 2, scale = 5, size = 20)
vec_z = np.random.uniform(low = -2, high =2, size = 50)
You can write your own functions:
#---- DEFINE
def my_function(parameter):
body
return expression
#---- RUN
my_function(parameter = argument)
#---- RUN
my_function(argument)
*Example*
Calculate $V=P\left(1+{\frac {r}{n}}\right)^{nt}$
# We are going to define a function "fn_compute_value"
# You can choose any name
# Using prefixes like "fn_" can help you remember this is a "function" object
# The parameters are
def fn_compute_value(P, r, n, t):
    """Compound-interest formula: V = P * (1 + r/n) ** (n*t).

    P: principal, r: annual interest rate, n: compounding periods
    per year, t: number of years. Returns the final value V.
    """
    growth_factor = 1 + r / n
    num_periods = n * t
    return P * growth_factor ** num_periods
# You can now compute the formula with different values
V1 = fn_compute_value(P = 1000, r = 0.01, n = 20, t=10)
V2 = fn_compute_value(P = 10, r = 0.01, n = 20, t=10)
print(V1)
print(V2)
1105.1432983541217 11.051432983541218
*Example*
Write a function that calculates $f(x) = x^2 + 2x + 1$.
def compute_function(x):
    """Evaluate the quadratic f(x) = x^2 + 2x + 1 at x."""
    squared_term = x ** 2
    linear_term = 2 * x
    return squared_term + linear_term + 1
compute_function(1)
4
*Example*
Write a function
def numeric_grade(grade):
    """Return "pass" when grade is at least 55, otherwise "fail"."""
    return "pass" if grade >= 55 else "fail"
print(numeric_grade(100))
pass
*Example*
Write a function
"Dear customer {first_name} {last_name}, your car model {car_model} is ready"
def message(first_name, last_name, car_model):
    """Build the customer notification string for a finished car."""
    template = "Dear customer {} {}, your car model {} is ready."
    return template.format(first_name, last_name, car_model)
message("Jiuru", "Lyu", "Tesla")
'Dear customer Jiuru Lyu, your car model Tesla is ready.'
*Lambda Functions*
"Lambda Functions" are defined in one line:
my_function = lambda parameters: expression
*Example* Calculate $x + y + z$
# (a) Define function
fn_sum = lambda x,y,z: x + y + z
# (b) Run function
fn_sum(1,2,3)
6
*Example*
Calculate $V=P\left(1+{\frac {r}{n}}\right)^{nt}$
fn_compute_value = lambda P,r,n,t: P*(1 + r/n)**(n*t)
V1 = fn_compute_value(P = 1000, r = 0.01, n = 20, t=10)
V2 = fn_compute_value(P = 10, r = 0.01, n = 20, t=10)
print(V1)
print(V2)
1105.1432983541217 11.051432983541218
*Example*
Boolean + Functions:
fn_iseligible_vote = lambda age: (age >= 18)
print(fn_iseligible_vote(19))
True
*Example*
Looping + Functions:
list_ages = [18, 29, 15, 32, 6]
list_eligible = []
for age in list_ages:
list_eligible.append(fn_iseligible_vote(age))
print(list_eligible)
[True, True, False, True, False]
*Functions for Visualization*
Returning a value is not always necessary; you can write:
#---- DEFINE
def my_function(parameter):
body
*Example*
A customized plot
def red_histogram(vec_x,title):
    """Draw and show a red histogram of vec_x with the given title.

    Uses the module-level matplotlib.pyplot import (plt); the call
    order matters (title/labels must be set before plt.show()).
    Returns None.
    """
    # Statements below are the original code; only this docstring was added.
    plt.hist(x = vec_x, color = "red")
    plt.title(title)
    plt.ylabel("Frequency")
    plt.show()
red_histogram(vec_x = carfeatures["weight"], title = "Histogram")
red_histogram(vec_x = carfeatures["acceleration"], title = "Histogram")
*Example*
Create a function that computes a red scatter plot that takes $y$ and $x$ inputs
def red_scatter(vec_x, vec_y, title):
    """Draw and show a red scatter plot of vec_y against vec_x.

    Uses the module-level matplotlib.pyplot import (plt); title must
    be set before plt.show(). Returns None.
    """
    plt.scatter(x = vec_x, y = vec_y, color = "red")
    plt.title(title)
    plt.show()
red_scatter(vec_x = carfeatures["weight"], vec_y = carfeatures["acceleration"], title = "Acceleration vs. Weight")
Most of the variables we've defined so far are "global"
message_hello = "hello"
number3 = 3
print(message_hello + " world")
print(number3 * 2)
hello world 6
Any "global" variable can be referenced inside functions
# Correct Example:
def fn_add_recommended(x, y, z):
    """Add the three inputs; every value arrives as an explicit argument."""
    total = x + y + z
    return total
print(fn_add_recommended(x = 1, y = 2, z = 5))
print(fn_add_recommended(x = 1, y = 2, z = 10))
8 13
# Example that runs (but not recommended)
# Python will try to fill in any missing inputs
# with variables in the working environment
def fn_add_notrecommended(x, y):
    """Add x and y to the global variable z.

    z is not a parameter: Python resolves it from the enclosing module
    scope at call time, so the result silently changes whenever the
    global z changes — hence "not recommended".
    """
    return x + y + z
z = 5
print(fn_add_notrecommended(x = 1, y = 2))
z = 10
print(fn_add_notrecommended(x = 1, y = 2))
8 13
Variables defined inside functions are "local"
# This is an example where we define a quadratic function
# (x,y) are both local variables of the function
#
# When we call the function, only the arguments matter.
# any intermediate value inside the function
def fn_square(x):
    """Return the square of x; all names used here are local."""
    return x ** 2
x = 0
y = 1
print(fn_square(x = 10))
100
Local variables are not stored in the working environment
x = 5
y = 4
print("Example 1:")
print(fn_square(x = 10))
print(x)
print(y)
print("Example 2:")
print(fn_square(x = 20))
print(x)
print(y)
Example 1: 100 5 4 Example 2: 400 5 4
To permanently modify a variable, use the "global" command
def modify_x():
    """Permanently increase the module-level variable x by 5.

    The global statement is required so the assignment rebinds the
    module-level x instead of creating a new local.
    """
    global x
    x += 5
x = 1
# Now, running the function will permanently increase x by 5.
modify_x()
print(x)
6
*Example*
x = 1
modify_x()
modify_x()
print(x)
def fn_square(x):
    """Square x, storing the result in the module-level variable y.

    Because of the global statement, calling this function overwrites
    any existing module-level y as a side effect; the squared value is
    also returned.
    """
    global y
    y = x ** 2
    return y
fn_square(x)
11
121
A dictionary is another way to store data.
# This is an example of a pandas data frame created from a dictionary
# This example illustrates the basic syntax of a dictionary
car_dictionary = {"car_model": ["Ferrari","Tesla","BMW"],
"year": ["2018","2023","2022"] }
pd.DataFrame(car_dictionary)
| car_model | year | |
|---|---|---|
| 0 | Ferrari | 2018 |
| 1 | Tesla | 2023 |
| 2 | BMW | 2022 |
\¶Use a backslash to define strings over multiple lines
example_string = "This is a string \
defined over multiple lines"
print(example_string)
This is a string defined over multiple lines
Backslash and Quotation Marks
# Use a backslash + quotation
example_double = "This will \"supposedly\" put double quotes inside a string"
print(example_double)
This will "supposedly" put double quotes inside a string
# There is no need for a backslash given single quotes
example_single = "This will 'supposedly' put single quotes inside a string"
print(example_single)
This will 'supposedly' put single quotes inside a string
*Example*:
SELECT "driverId" FROM results;using backslash
example_string = "SELECT \"driverId\" FROM results;"
print(example_string)
SELECT "driverId" FROM results;
Convert to string (a)
% is used to denote date formats$\quad$ 
# "String from time" .dt.strftime()
# The first argument needs to be a datetime type
# The second argument is the format you want to use
# Note: "dt" stands for datetime
financial["month_str"] = financial["date"].dt.strftime("%m")
financial["week_str"] = financial["date"].dt.strftime("%W")
$\quad$ 
financial["monthname"] = financial["date"].dt.strftime("%B")
financial["weekdayname"] = financial["date"].dt.strftime("%A")
We can also use personalized formats
# Insert wildcards inside custom strings
# Internally it will "fill-in-the-blank" with the corresponding information
# You can use commas, dashes (--), slash (/) or other characters
message_monthname = financial["date"].dt.strftime("This is the month of %B")
message_monthday = financial["date"].dt.strftime("The day of the week is %A")
message_yearmonth = financial["date"].dt.strftime("%Y-%m")
display(message_yearmonth)
0 2018-04
1 2018-04
2 2018-04
3 2018-04
4 2018-04
...
1300 2023-03
1301 2023-03
1302 2023-03
1303 2023-04
1304 2023-04
Name: date, Length: 1305, dtype: object
*Example*
.dt.strftime()$\quad$ Monday, December 31, 2023
financial["date_test"] = financial["date"].dt.strftime("%A, %B %d, %Y")
# Examples of NumPy numbers
np.pi
3.141592653589793
The following code are using NumPy functions of the following mathematical expressions: $$ \ln{x},\ e^x,\ \sin{x},\ \cos{x},\ \sqrt{x}. $$
print(np.log(1))
print(np.exp(1))
print(np.sin(1))
print(np.cos(1))
print(np.sqrt(1))
0.0 2.718281828459045 0.8414709848078965 0.5403023058681398 1.0
*Exercise*
x = 5
print(np.pi * x**2)
print(1 / np.sqrt(2 * np.pi) * np.exp((-x ** 2)))
78.53981633974483 5.540487995575833e-12
The following code creates an array from a list for the following vectors: $$ a = \left(\begin{matrix}1\\2\\3\end{matrix}\right), b = \left(\begin{matrix}0\\1\\0\end{matrix}\right), c = \left(\begin{matrix}10\\100\\1000\\2000\\5000\end{matrix}\right) $$
vec_a = np.array([1,2,3])
vec_b = np.array([0,1,0])
vec_c = np.array([10,100,1000,2000,5000])
*Exercise*
Create an array for vector $d = \left(\begin{matrix}4\\2\end{matrix}\right)$
vec_d = np.array([4, 2])
Remember, arrays are like lists, starting their numbering at zero. We can extract values from arrays using [index].
print(vec_a[0])
print(vec_a[2])
1 3
We can use basic operators to operate with a single array and a scalar. For example, $$ a + 2 = \begin{pmatrix} a_1 + 2 \\ a_2 + 2 \\ a_3 + 2 \end{pmatrix} $$
print(vec_a * 2)
print(vec_a / 2)
print(vec_a + 2)
print(vec_a ** 2)
[2 4 6] [0.5 1. 1.5] [3 4 5] [1 4 9]
When we are operating with two arrays, we are doing element-by-element operations. $$ a + b = \begin{pmatrix} a_1 \\ a_2 \\ a_3 \end{pmatrix} + \begin{pmatrix} b_1 \\ b_2 \\ b_3 \end{pmatrix} = \begin{pmatrix} a_1 + b_1 \\ a_2 + b_2 \\ a_3 + b_3 \end{pmatrix}$$
$$ a * b = \begin{pmatrix} a_1 * b_1 \\ a_2 * b_2 \\ a_3 * b_3 \end{pmatrix}$$*Note*: When doing element-by-element operations, make sure the arrays have the same size.
print(vec_a + vec_b)
print(vec_a * vec_b)
print(vec_a - vec_b)
print(vec_a / vec_b)
[1 3 3] [0 2 0] [1 1 3] [inf 2. inf]
/var/folders/_b/554mn6jx1vv8940qhs9gysv80000gn/T/ipykernel_36370/300864503.py:4: RuntimeWarning: divide by zero encountered in true_divide print(vec_a / vec_b)
We can get a statistical summary of an array using the following code
print(np.mean(vec_a))
print(np.std(vec_a))
print(np.min(vec_a))
print(np.median(vec_a))
print(np.max(vec_a))
2.0 0.816496580927726 1 2.0 3
The following code creates the matrix $X$, where
$$ X = \begin{pmatrix} 1 & 2 & 3 \\ 4 & 5 & 6 \\ 0 & 8 & 9 \end{pmatrix} $$
X = np.array([[1,2,3],[4,5,6],[0,8,9]])
print(X)
[[1 2 3] [4 5 6] [0 8 9]]
We can use np.column_stack() to create a matrix by stacking different columns.
Y = np.column_stack([[1,0,1],[0,1,0]])
print(Y)
[[1 0] [0 1] [1 0]]
We can use np.matrix.transpose() to calculate the transpose of a matrix.
np.matrix.transpose(Y)
array([[1, 0, 1],
[0, 1, 0]])
We use np.matmul() to do matrix multiplications.
*Note*: Remember to check the dimensions of two matrices before doing the multiplication.
np.matmul(X,Y)
array([[ 4, 2],
[10, 5],
[ 9, 8]])
We use np.linalg.inv() to calculate the inverse of a matrix.
X_inv = np.linalg.inv(X)
print(X_inv)
[[-0.14285714 0.28571429 -0.14285714] [-1.71428571 0.42857143 0.28571429] [ 1.52380952 -0.38095238 -0.14285714]]
Why randomness?
To create a vector of random variables, we can use np.random.normal(). More functions to generate random variables will be discussed in section 5.1.
randomvar_a = np.random.normal(loc=0, scale=1, size=10)
print(randomvar_a)
[ 0.04929356 0.29161093 -2.19586755 0.52144867 -2.07762544 0.04775888 0.32964566 1.48870342 -0.35287619 1.90266298]
Random numbers are different every time we run the code. To avoid this problem, we use the function np.random.seed().
# No matter how many times we run this code chunk,
# The result will always be the same
np.random.seed(12345)
random_var_b = np.random.normal(loc=0, scale=1, size=10)
print(random_var_b)
[-0.20470766 0.47894334 -0.51943872 -0.5557303 1.96578057 1.39340583 0.09290788 0.28174615 0.76902257 1.24643474]
To visualize random numbers, we can use histograms.
randomvar_x = np.random.normal(loc=0, scale=1, size=10000)
plt.hist(x = randomvar_x)
plt.xlabel("Variable a")
plt.ylabel("Frequency")
Text(0, 0.5, 'Frequency')
We can create two arrays with and without NaNs.
Note: The np.array() function converts a list to an array
vec_without_nans = np.array([1, 1, 1])
vec_with_nans = np.array([np.nan, 4, 5])
When you operate on vectors containing NaN, any entry involving a NaN propagates NaN (no error is raised).
print(vec_without_nans * vec_with_nans)
print(vec_without_nans / vec_with_nans)
print(vec_without_nans + vec_with_nans)
print(vec_without_nans - vec_with_nans)
[nan 4. 5.] [ nan 0.25 0.2 ] [nan 5. 6.] [nan -3. -4.]
Similarly, some summary statistics will not work if there are NaNs.
For example The np.mean() doesn't work if the mean includes NaNs
print(np.mean(vec_with_nans))
nan
However, some commands ignore the NaNs.
For example, The np.nanmean() computes the mean over the numeric observations.
print(np.nanmean(vec_with_nans))
4.5
However, pandas statistical summary always ignores NaNs.
dataset = pd.DataFrame([])
dataset["x"] = vec_with_nans
dataset["x"].mean()
4.5
We use the following code to create a scatter plot based on a data frame.
plt.scatter(x = carfeatures['weight'], y = carfeatures['mpg'])
plt.show()
# Another example
plt.scatter(x = carfeatures['acceleration'], y = carfeatures['mpg'])
plt.show()
Besides visualizing data frames, we can also visualize lists
# Recall the list_colors we defined in section 1.
plt.hist(x = list_colors)
(array([2., 0., 0., 0., 0., 2., 0., 0., 0., 1.]), array([0. , 0.2, 0.4, 0.6, 0.8, 1. , 1.2, 1.4, 1.6, 1.8, 2. ]), <BarContainer object of 10 artists>)
# Another example
myChoice = [0, 1, 0, 0, 0, 1, 0, 0, 0]
plt.hist(myChoice)
(array([7., 0., 0., 0., 0., 0., 0., 0., 0., 2.]), array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]), <BarContainer object of 10 artists>)
# Scatter plots
plt.scatter(x = list_numbers, y = list_numbers_sqr)
plt.xlabel("A meaningful name for the X-axis")
plt.ylabel("Favourite name for Y-axis")
plt.title("A graph showing the square of a list of numbers")
plt.show()
*Exercise*
Create a list with numbers, and then create your own scatter plot
e = math.e
xList = [0, 1, 2, 3, 4, 5]
yList = [e**0, e**1, e**2, e**3, e**4, e**5]
plt.scatter(x = xList, y = yList)
plt.xlabel("Numbers")
plt.ylabel("Exponentiation of Numbers")
plt.title("A graph showing the exponentiation of a list of numbers")
plt.show()
We can create multiple plots in a row, using subplots
#------------------------ Setting up subplots----------------------------------#
# Create a plot with 1 row, 2 columns
# You will create a list of subfigures "list_subfig"
# You can choose whichever name you like
# The option "figsize" indicates the (width,height) of the graph
fig, list_subfig = plt.subplots(1, 2,figsize = (6,3))
# The tight layout option ensures that the axes are not overlapping
plt.tight_layout()
# First Figure
list_subfig[0].hist(x = carfeatures["weight"])
list_subfig[0].set_title("Weight")
list_subfig[0].set_xlabel("Value of Weight")
list_subfig[0].set_ylabel("Value")
# Second Figure
list_subfig[1].hist(x = carfeatures["mpg"])
list_subfig[1].set_title("mpg Distribution")
list_subfig[1].set_xlabel("Value of mpg")
list_subfig[1].set_ylabel("Value")
# Note:
# Use the set_title() function for the title of subfigures
# Similarly, use "set_xlabel()" and "set_ylabel()"
Text(291.2335858585858, 0.5, 'Value')
# Create an empty layout with 1 row x 1 column
# "figsize" takes a list with the width and height
fig, ax = plt.subplots(1, 1, figsize = [5, 3])
# Extract information that we want to plot
# Indicate that "date" is the x-axis
plot_data = portfolios[["date", "growth_sp500", "growth_djia"]].set_index("date")
# Generate basic lineplot
# We add with the subplot environment and add more info
# The linewidth option controls how thin the lines are
# With subplots we use "set_xlabel" rather than "xlabel"
fig, ax = plt.subplots(1, 1)
ax.plot(plot_data,
linewidth = 1)
ax.legend(["S&P 500", "Dow Jones"])
ax.set_ylabel("Daily percentage growth")
ax.set_xlabel("Date")
ax.set_title("Portfolio performance over time")
Text(0.5, 1.0, 'Portfolio performance over time')
# Baseline plot
fig, ax = plt.subplots(1, 1)
ax.hist(x = portfolios["growth_sp500"])
ax.hist(x = portfolios["growth_djia"])
ax.legend(["S&P 500", "Dow Jones"])
ax.set_ylabel("Frequency")
ax.set_xlabel("Daily percentage growth")
ax.set_title("Histogram of portfolio growth")
Text(0.5, 1.0, 'Histogram of portfolio growth')
*Example*
Create a scatter plot of growth_sp500 (y-axis) and growth_djia (x-axis) using .subplots()
Label the axes and the title.
HINT: Write ax.scatter(x = ..., y = ...)
fig, ax = plt.subplots(1, 1)
ax.scatter(x = "growth_djia", y = "growth_sp500", data = portfolios, color = "pink")
ax.set_xlabel("Growth of Dow Jones")
ax.set_ylabel("Growth of S&P 500")
ax.set_title("Scatter plot of portfolio growth")
ax.legend(["S&P 500 vs. Dow Jones"])
<matplotlib.legend.Legend at 0x7f9ec2da2f10>
"Parse" time columns:
Convert string column to datetime format. If the date format is simple, you can also parse on input as
# Option 1: parse the column at read time (the original line was missing
# its closing parenthesis — fixed here). Note: this assumes the CSV has a
# column literally named "date" — TODO confirm against the raw file.
financial = pd.read_csv("data_raw/financial.csv", parse_dates = ["date"])
# Option 2: convert an existing string column explicitly after reading.
financial["date"] = pd.to_datetime(financial["date_str"])
# Check Types
# Standard data types are "int", "str", "float", and "bool"
# There is also a "datetime" types
financial.dtypes
Unnamed: 0 int64 date_str object sp500 float64 djia float64 date_ex1 object date_ex2 object date_ex3 object date datetime64[ns] dtype: object
Visualize Time data
# plt.plot() is used to create line plots
# The first two arguments are column names for the (x,y) data
# The third argument is the data
plt.plot("date", "sp500", data = financial)
plt.xlabel("Time")
plt.ylabel("S&P 500 Index")
plt.title("The evolution of the stock market")
Text(0.5, 1.0, 'The evolution of the stock market')
*Example*
Industrial Index ("djia") on the y-axis and "date" on the x-axis.
plt.plot("date", "djia", data = financial)
plt.xlabel("Time")
plt.ylabel("Dow Jones Industrial Index")
plt.title("The Evolution of the Stock Market")
Text(0.5, 1.0, 'The Evolution of the Stock Market')
Parsing + wild cards
# Combine wildcards + literal characters depending on the input.
# Formats can include spaces, commas, "/", "-" or any other separator.
# Be careful to write the wildcard letters in upper or lower case
# depending on the intended format (e.g. %B = full month name,
# %A = full weekday name, %d = day of month, %Y = 4-digit year).
date1 = pd.to_datetime(financial["date_ex1"], format = "%B %d %Y")
date2 = pd.to_datetime(financial["date_ex2"], format = "%A, %Y-%m-%d")
Period grouping
# In "freq" specify the letter for the level of aggregation:
# year (y), month (m), week (w), day (d).
# There are also more advanced options! See the pandas documentation.
# NOTE(review): lowercase frequency aliases are deprecated in recent
# pandas versions ("W" may be required) — confirm with the pandas in use.
financial["week"] = financial["date"].dt.to_period(freq = "w")
Aggregate by period
# Group on the period column built above.
# Wrapping the expression in () lets us split the command across
# multiple lines (escape characters \ would also work).
# .agg(new_name = (column, statistic)) names the output column.
weeklydata = (financial
.groupby("week")
.agg( sp500_mean = ("sp500","mean")))
*Example*
Use pd.to_datetime() with a format string. HINT: Refer to the table of wildcards in the previous section.
date3 = pd.to_datetime(financial["date_ex3"], format = "%b-%d, %y")
*Example*
# In "freq" specify the letter for the level of aggregation:
# year (y), month (m), week (w), day (d).
# pd.Grouper groups directly on a datetime column without first
# creating a period column.
month_config = pd.Grouper(key='date', freq='m')
# Group on the period column.
# The wrapper () splits the command into multiple lines.
# ".reset_index()" converts the grouper back into a regular
# column — this is important for plotting below.
monthlydata = (financial
.groupby(month_config)
.agg(sp500_mean = ("sp500", "mean"))
.reset_index())
# Plot the monthly averages in chronological order.
plt.plot("date", "sp500_mean",
data = monthlydata.sort_values("date", ascending = True))
plt.xlabel("Time")
plt.ylabel("S&P 500")
plt.title("Monthly average stock market performance")
Text(0.5, 1.0, 'Monthly average stock market performance')
*Example*
deviation of the S&P 500 at the weekly level.
# Weekly standard deviation of the S&P 500, built with pd.Grouper
# (same pattern as the monthly-average example above).
week_config = pd.Grouper(key='date', freq='w')
weeklydata = (financial
.groupby(week_config)
.agg(sp500_std = ("sp500", "std"))
.reset_index())
plt.plot("date", "sp500_std",
data = weeklydata.sort_values("date", ascending = True))
plt.xlabel("Time")
plt.ylabel("S&P 500")
plt.title("Weekly standard deviation stock market performance")
Text(0.5, 1.0, 'Weekly standard deviation stock market performance')
Plot multiple columns
# Plot several y-columns against one x-column: select the columns,
# mark the x-axis column with set_index(), then call .plot().
# This applies to any line plot, with or without dates.
# The legend is the box naming the lines; dropping the .legend()
# call would fall back to the default column names.
financial[["date", "sp500", "djia"]].set_index("date").plot()
plt.xlabel("Time")
plt.ylabel("Value of Index Funds")
plt.legend(["S&P 500", "Dow Jones"])
<matplotlib.legend.Legend at 0x7f7d32031d90>
Remarks ...
Change between periods
# Row-over-row change in "sp500". By convention the first row gets a
# missing value because there is nothing to compare against. For this
# to be meaningful, the dataset must already be sorted by date.
financial["diff_sp500"] = financial["sp500"].diff()
Compute lag + percentage growth
# ".shift(1)" creates a new column holding the value of "sp500"
# one period before; by convention the first row is assigned
# a missing value.
# Combining ".diff()" and ".shift()" yields the percentage growth rate.
financial["lag_sp500"] = financial["sp500"].shift(1)
financial["growth_sp500"] = financial["diff_sp500"] * 100 / financial["lag_sp500"]
Time between dates
# In the financial data the portfolio prices aren't recorded on
# weekends, so consecutive rows can be more than one day apart.
# Sometimes it's important to account for these differences in the
# length between time periods when computing growth rates.
# On a timedelta column we can read dt.days (and related accessors).
financial["diff_date"] = financial["date"] - financial["date"].shift(1)
financial["count_days"] = financial["diff_date"].dt.days
Plot Growth
plt.plot("date", "growth_sp500",
data = financial.sort_values("date", ascending = True))
plt.xlabel("Time")
plt.ylabel("Daily percentage change ")
plt.title("Change in the S&P 500 Index")
Text(0.5, 1.0, 'Change in the S&P 500 Index')
*Example*
single plot
# Daily percentage growth of the Dow Jones, built the same way as
# growth_sp500 above: difference, lag, then ratio.
financial["diff_djia"] = financial["djia"].diff()
financial["shift_djia"] = financial["djia"].shift(1)
financial["growth_djia"] = financial["diff_djia"] * 100 / financial["shift_djia"]
# Equivalent one-liner:
# financial["pct_change_djia"] = financial["djia"].pct_change() * 100
financial[["date", "growth_sp500", "growth_djia"]].set_index("date").plot()
plt.xlabel("Time")
plt.ylabel("Daily Percentage Change")
plt.title("Change in S&P 500 and Dow Jones Index")
plt.legend(["S&P 500", "Dow Jones"])
plt.show()
Subsetting before/after
# Because "date" has a datetime type, Python interprets "2019-01-01"
# inside the query command as a date.
# Note: remember to use single quotations for ".query()" so the inner
# double quotes survive.
# NOTE(review): "subset_before" keeps dates ON/AFTER 2019-01-01 and
# "subset_after" keeps dates on/before 2020-01-01 — the names look
# swapped relative to the conditions; confirm intent.
subset_before = financial.query('date >= "2019-01-01" ')
subset_after = financial.query('date <= "2020-01-01" ')
Obtain a subset of between
# This command applies the function ".between()" to the column
subset_between = financial.query('date.between("2020-03-01","2020-05-01")')
Flag Observations
financial["bool_period"] = financial["date"].between("2020-03-01","2020-05-01")
financial["bool_example"] = financial["growth_sp500"] > 5
Plot Results
# Create a line plot of the daily growth series.
plt.plot("date", "growth_sp500", data = financial)
plt.xlabel("Time")
plt.ylabel("Daily percentage change ")
plt.title("The S&P 500 during the start of COVID")
# Add a shaded region with fill_between:
# "x" is the x-coordinates, "y1"/"y2" are the lower and upper bounds
# of the shaded band — set here to the minimum and maximum of the
# outcome — and "where" selects the x-range via a boolean condition
# (the bool_period flag built above).
vec_y = financial["growth_sp500"]
plt.fill_between(x= financial["date"],
y1 = vec_y.min(),
y2 = vec_y.max(),
where = financial["bool_period"],
alpha = 0.2,color = "red")
plt.show()
bills_actions.dtypes
Congress int64 bill_number int64 bill_type object action object main_action object category object member_id int64 dtype: object
We start by generating a string with text. The WordCloud() command generates an object; to display it, use plt.imshow().
Words with higher frequency will tend to appear larger (see the advanced options at the end for how to adjust the relative scaling).
# Build a word cloud from a string; word size scales with frequency.
text = "Introduction to Statistical Computing Python Python Python Python Python"
word_cloud = WordCloud(background_color = "white").generate(text)
plt.imshow(word_cloud)
plt.axis("off")
(-0.5, 399.5, 199.5, -0.5)
Get word frequencies
word_frequencies = WordCloud().process_text(text)
word_frequencies
{'Introduction': 1, 'Statistical': 1, 'Computing': 1, 'Python': 5}
Adding stop words
# Create an adjusted list of stop words: the package's built-in
# STOPWORDS set plus two custom words to exclude.
stop_words = list(STOPWORDS) + ["Introduction","Computing"]
text = "Introduction to Statistical Computing Python Python Python Python Python"
# Plot results.
# If you don't want any stopwords, use "stopwords = []" instead.
word_cloud = WordCloud(background_color = "white",
stopwords = stop_words).generate(text)
plt.imshow(word_cloud)
plt.axis("off")
(-0.5, 399.5, 199.5, -0.5)
*Example*:
text = "Emory University is a private research university in Atlanta, Georgia. Founded in 1836 as Emory College by the Methodist Episcopal Church and named in honor of Methodist bishop John Emory, Emory is the second-oldest private institution of higher education in Georgia. Emory University has nine academic divisions: Emory College of Arts and Sciences, Oxford College, Goizueta Business School, Laney Graduate School, School of Law, School of Medicine, Nell Hodgson Woodruff School of Nursing, Rollins School of Public Health, and the Candler School of Theology. Emory University students come from all 50 states, the District of Columbia, five territories of the United States, and over 100 foreign countries. Emory Healthcare is the largest healthcare system in the state of Georgia and comprises seven major hospitals, including Emory University Hospital and Emory University Hospital Midtown. The university operates the Winship Cancer Institute, Yerkes National Primate Research Center, and many disease and vaccine research centers. Emory University is the leading coordinator of the U.S. Health Department's National Ebola Training and Education Center. The university is one of four institutions involved in the NIAID's Tuberculosis Research Units Program. The International Association of National Public Health Institutes is headquartered at the university and the Centers for Disease Control and Prevention and the American Cancer Society are national affiliate institutions located adjacent to the campus. Emory University has the 15th-largest endowment among U.S. colleges and universities. The university is classified among \"R1: Doctoral Universities – Very high research activity\" and is cited for high scientific performance and citation impact in the CWTS Leiden Ranking. The National Science Foundation ranked the university 36th among academic institutions in the United States for research and development (R&D) expenditures. 
In 1995 Emory University was elected to the Association of American Universities, an association of the 65 leading research universities in the United States and Canada. Emory faculty and alumni include 2 Prime Ministers, 9 university presidents, 11 members of the United States Congress, 2 Nobel Peace Prize laureates, a Vice President of the United States, a United States Speaker of the House, and a United States Supreme Court Justice. Other notable alumni include 21 Rhodes Scholars and 6 Pulitzer Prize winners, as well as Emmy Award winners, MacArthur Fellows, CEOs of Fortune 500 companies, heads of state and other leaders in foreign government. Emory has more than 149,000 alumni, with 75 alumni clubs established worldwide in 20 countries."
word_cloud = WordCloud(background_color = "white").generate(text)
plt.imshow(word_cloud)
plt.axis("off")
(-0.5, 399.5, 199.5, -0.5)
Concatenate the column into a single long sentence
# Concatenate all elements of bills_actions["action"] into one long
# string and build a word cloud from it.
# FIX: the original referenced an undefined name "bills"; the dataset
# loaded at the top of the file is "bills_actions".
# FIX: join on a space so the last word of one action does not fuse
# with the first word of the next (fused tokens distort frequencies).
text_bills = " ".join(bills_actions["action"])
word_cloud = WordCloud(background_color = "white").generate(text_bills)
plt.imshow(word_cloud)
plt.axis("off")
(-0.5, 399.5, 199.5, -0.5)
Text by subgroup
# Create subsets of the data by bill category.
subset_bills_house = bills_actions.query('category == "house bill"')
subset_bills_senate = bills_actions.query('category == "senate bill"')
# Create strings with all the words mentioned for those observations.
# NOTE(review): "".join concatenates actions with no separator, so the
# last word of one action fuses with the first word of the next —
# consider " ".join; confirm intended behavior.
text_house = "".join(subset_bills_house["action"])
text_senate = "".join(subset_bills_senate["action"])
# Use subplots to create one figure with two side-by-side word clouds.
fig, list_subfig = plt.subplots(1,2,figsize = [8,3])
word_cloud_house = WordCloud(background_color = "white").generate(text_house)
list_subfig[0].imshow(word_cloud_house)
list_subfig[0].axis("off")
list_subfig[0].set_title("House of Representatives")
word_cloud_senate = WordCloud(background_color = "white").generate(text_senate)
list_subfig[1].imshow(word_cloud_senate)
list_subfig[1].axis("off")
list_subfig[1].set_title("Senate")
Text(0.5, 1.0, 'Senate')
Note: In general, many possibilities for splitting by subgroups! Years, geography, type of speaker, type of document, etc.
*Example*
# Word clouds for house resolutions and senate resolutions
# (separately), using plt.subplots() and the same pattern as the
# bills example above.
# FIX: the original referenced an undefined name "resolutions"; the
# resolution categories live in the "bills_actions" dataset.
subset_resolution_house = bills_actions.query('category == "house resolution"')
subset_resolution_senate = bills_actions.query('category == "senate resolution"')
text_resolution_house = "".join(subset_resolution_house["action"])
text_resolution_senate = "".join(subset_resolution_senate["action"])
fig, list_subfig = plt.subplots(1,2,figsize = [8,3])
word_cloud_resolution_house = WordCloud(background_color = "white").generate(text_resolution_house)
list_subfig[0].imshow(word_cloud_resolution_house)
list_subfig[0].axis("off")
list_subfig[0].set_title("House of Representatives")
word_cloud_resolution_senate = WordCloud(background_color = "white").generate(text_resolution_senate)
list_subfig[1].imshow(word_cloud_resolution_senate)
list_subfig[1].axis("off")
list_subfig[1].set_title("Senate")
Text(0.5, 1.0, 'Senate')
Advanced Settings of Word Cloud
# Relative scaling of high-frequency words: "relative_scaling"
# controls how strongly word size tracks frequency.
text = "Introduction to Statistical Computing Python Python Python Python Python"
word_cloud = WordCloud(width = 400,
height = 200,
relative_scaling = 0.5, # or "auto"
background_color = "white").generate(text)
plt.imshow(word_cloud)
plt.axis("off")
(-0.5, 399.5, 199.5, -0.5)
Formatting axis labels
# Baseline plot: both growth series against the date index.
fig, ax = plt.subplots(1, 1)
ax.plot(portfolios[["date", "growth_sp500", "growth_djia"]].set_index("date"))
ax.legend(["S&P 500", "Dow Jones"])
# "labelpad" is the space between the axis label and the tick numbers.
ax.set_ylabel("Daily percentage growth",
fontsize = 20,
color = "pink",
labelpad = 20)
ax.set_xlabel("Date",
fontsize = 8,
color = "purple",
labelpad = 5)
Text(0.5, 0, 'Date')
Formatting ticks
# Baseline plot
fig, ax = plt.subplots(1, 1)
ax.plot(portfolios[["date", "growth_sp500", "growth_djia"]].set_index("date"))
ax.legend(["S&P 500", "Dow Jones"])
ax.set_ylabel("Daily percentage growth")
ax.set_xlabel("Date")
# Change the format of the tick labels on each axis.
# "rotation" is an angle from -90 to 90 degrees.
ax.xaxis.set_tick_params(labelsize = 10,
rotation = 45,
colors = "red")
ax.yaxis.set_tick_params(labelsize = 20,
rotation = 0,
colors = "blue")
*Example*
Copy-paste the code for the histogram above
Change the formatting of the x-axis by
set_tick_params() and set_xlabel(..., labelpad = 15). Try different values of labelpad.
# Overlapping histograms with rotated tick labels and padded axis labels.
fig, ax = plt.subplots(1, 1)
ax.hist(x = portfolios["growth_sp500"], alpha = 0.5)
ax.hist(x = portfolios["growth_djia"], alpha = 0.5)
ax.legend(["S&P 500", "Dow Jones"])
ax.set_ylabel("Frequency",
labelpad = 15)
ax.set_xlabel("Daily percentage growth",
labelpad = 15)
ax.set_title("Histogram of portfolio growth")
ax.xaxis.set_tick_params(rotation = 45)
ax.yaxis.set_tick_params(rotation = 45)
Formatting date axes
# Baseline plot
fig, ax = plt.subplots(1, 1)
ax.plot(portfolios[["date", "growth_sp500", "growth_djia"]].set_index("date"))
ax.legend(["S&P 500", "Dow Jones"])
ax.set_ylabel("Daily percentage growth")
ax.set_xlabel("Date")
# Establish the frequency of the tick labels and their format.
# Can also use "DayLocator", "MonthLocator", "YearLocator".
# Use wildcards to set the date format: see the lecture on time data.
# interval = 12 places one tick every 12 months.
config_ticks = mdates.MonthLocator(interval = 12)
format_ticks = mdates.DateFormatter("%y-%m-%d")
ax.xaxis.set_major_locator(config_ticks)
ax.xaxis.set_major_formatter(format_ticks)
Formatting numeric axes
$\qquad$ 
# Baseline plot
fig, ax = plt.subplots(1, 1)
ax.plot(portfolios[["date", "growth_sp500", "growth_djia"]].set_index("date"))
ax.legend(["S&P 500", "Dow Jones"])
ax.set_ylabel("Daily percentage growth")
ax.set_xlabel("Date")
# Set the number of ticks, configure them, and display them:
# at most M ticks, each formatted with one decimal place.
M = 10
config_ticks = ticker.MaxNLocator(M)
format_ticks = ticker.FormatStrFormatter("%.1f")
ax.yaxis.set_major_locator(config_ticks)
ax.yaxis.set_major_formatter(format_ticks)
# Set graph limits manually.
ax.set_ylim(-20,20)
(-20.0, 20.0)
*Example*
fig, ax = plt.subplots(1, 1)
ax.hist(x = portfolios["growth_sp500"], alpha = 0.5)
ax.hist(x = portfolios["growth_djia"], alpha = 0.5)
ax.legend(["S&P 500", "Dow Jones"])
ax.set_ylabel("Frequency")
ax.set_xlabel("Daily percentage growth")
ax.set_title("Histogram of portfolio growth")
M = 8
config_ticks = ticker.MaxNLocator(M)
format_ticks = ticker.FormatStrFormatter("%.4f")
ax.xaxis.set_major_locator(config_ticks)
ax.xaxis.set_major_formatter(format_ticks)
Anatomy of Figures
Export to File
# Scatter plot of highway vs. city fuel efficiency.
plt.scatter(x = cars["hwy"],y = cars["cty"])
plt.xlabel("Miles per gallon (Highway)")
plt.ylabel("Miles per gallon (City)")
plt.title("Fuel efficiency in highways vs the city")
# Store in different formats (the extension determines the file type).
plt.savefig("results/scatter_cty_hwy.png")
plt.savefig("results/scatter_cty_hwy.jpg")
Pro Tips:
*Example*
plt.scatter(x = cars["hwy"],y = cars["cty"])
plt.xlabel("Miles per gallon (Highway)")
plt.ylabel("Miles per gallon (City)")
plt.title("Fuel efficiency in highways vs the city")
plt.savefig("results/2023-04-24-scatter_cty_hwy.jpg")
Adding group indexing: .groupby() adds a grouping index that can be used for grouped plotting
["..."] extracts a particular column from the dataset.
Note: It's not always necessary to compute summary statistics after .groupby()
grouped_data = cars.groupby(['year'])["displ"]
Grouped Statistics
# Compute frequency (group sizes).
grouped_counts_onelevel = cars.groupby(['class']).size()
grouped_counts_multilevel = cars.groupby(['class','cyl']).size()
# Compute summary statistics of "displ" by group.
grouped_mean = cars.groupby(['class','cyl'])["displ"].mean()
grouped_sum = cars.groupby(['class','manufacturer'])["displ"].sum()
grouped_std = cars.groupby(['class','manufacturer'])["displ"].std()
grouped_min = cars.groupby(['class','manufacturer'])["displ"].min()
grouped_max = cars.groupby(['class','manufacturer'])["displ"].max()
Note:
A .groupby() result can be stored for later plotting; .groupby().agg() computes summary statistics.
Histogram by Groups
# To create overlapping histograms, use the "grouped_data" created
# above and the function ".plot.hist()".
# The option "alpha" controls the transparency.
grouped_data.plot.hist(alpha = 0.5)
# NOTE(review): grouped_data holds "displ" (engine displacement)
# grouped by year, but the labels below say "cylinders" — confirm
# which variable is intended.
plt.xlabel("Cylinders")
plt.ylabel("Frequency")
plt.title("Histogram of car cylinders")
plt.legend(title = "Year")
<matplotlib.legend.Legend at 0x7f9ec2c1ccd0>
Bar Chart
# A vertical bar plot
grouped_counts_onelevel.plot(kind = "bar")
plt.show()
# A horizontal bar plot
grouped_counts_onelevel.plot(kind = "barh")
plt.show()
Note:
plt.show() is used to display the charts as separate plots.
Multi-level Bar Chart
# .unstack("cyl") pivots the "cyl" (cylinders) index level into
# columns so the plot stacks the counts by cylinders within each class.
grouped_counts_multilevel.unstack("cyl").plot(kind = "bar", stacked = True)
plt.title("Stacked Bar Chart")
plt.ylabel("Frequency")
plt.xlabel("Class")
plt.show()
# The grouped bar chart presents the counts with two groupings: a
# primary level ("class", on the x-axis) and a secondary level
# ("cyl", pivoted into side-by-side bars by unstack("cyl")).
grouped_counts_multilevel.unstack("cyl").plot(kind = "bar", stacked = False)
plt.title("Grouped Bar Chart")
plt.ylabel("Frequency")
# FIX: the original called plt.ylabel("Class") a second time, which
# overwrote the y-label and left the x-axis unlabeled.
plt.xlabel("Class")
plt.show()
Legend Options
# Baseline plot
grouped_counts_multilevel.unstack("cyl").plot(kind = "bar",stacked = True)
plt.title("Stacked bar chart with legend on side")
plt.ylabel("Frequency")
# bbox_to_anchor = (x, y): coordinates on the graph, each between
# 0 and 1; values above one place the legend outside the axes.
# loc: which part of the legend box sits at the "bbox_to_anchor"
# point. If "bbox_to_anchor" is not specified, loc alone places the
# legend inside the plot in the desired position.
# Best to experiment with the options!
plt.legend(title='Cylinders',
bbox_to_anchor=(1, 0.5),
loc='center right',)
<matplotlib.legend.Legend at 0x7f9ec2c1c1f0>
*Example*
# Standard deviation of "cty" grouped by manufacturer (primary level)
# and year (secondary level), plotted as a stacked bar chart and
# saved to results/figure_barchart_stdcty_by_year.png.
car_cty_year = cars.groupby(["manufacturer", "year"])["cty"].std()
car_cty_year.unstack("year").plot(kind = "bar", stacked = True)
# FIX: the original filename had a typo ("stdycty"); the exercise
# asks for "figure_barchart_stdcty_by_year.png".
plt.savefig("results/figure_barchart_stdcty_by_year.png")
Check which styles are available
plt.style.available
['Solarize_Light2', '_classic_test_patch', '_mpl-gallery', '_mpl-gallery-nogrid', 'bmh', 'classic', 'dark_background', 'fast', 'fivethirtyeight', 'ggplot', 'grayscale', 'seaborn-v0_8', 'seaborn-v0_8-bright', 'seaborn-v0_8-colorblind', 'seaborn-v0_8-dark', 'seaborn-v0_8-dark-palette', 'seaborn-v0_8-darkgrid', 'seaborn-v0_8-deep', 'seaborn-v0_8-muted', 'seaborn-v0_8-notebook', 'seaborn-v0_8-paper', 'seaborn-v0_8-pastel', 'seaborn-v0_8-poster', 'seaborn-v0_8-talk', 'seaborn-v0_8-ticks', 'seaborn-v0_8-white', 'seaborn-v0_8-whitegrid', 'tableau-colorblind10']
Set the style. Can switch back by setting: plt.style.use('default')
# Apply a global style sheet; all subsequent plots use it.
plt.style.use('Solarize_Light2')
plt.scatter(x = cars["hwy"],y = cars["cty"])
plt.xlabel("Miles per gallon (Highway)")
plt.ylabel("Miles per gallon (City)")
plt.title("Fuel efficiency in highways vs the city")
Text(0.5, 1.0, 'Fuel efficiency in highways vs the city')
*Example*
Use plt.style.use(), for example plt.style.use("ggplot")
car_cty_year.unstack("year").plot(kind = "bar", stacked = True)
<AxesSubplot: xlabel='manufacturer'>
We can import .csv files
carfeatures = pd.read_csv("data/features.csv")
We can also import other types of files, such as stata (.dta) files and Excel (.xlsx).
carfeatures_dta = pd.read_stata("data/features.dta")
carfeatures_xlsx = pd.read_excel("data/features.xlsx")
We use the following code to write files:
# Write data frames back out; the method matches the file format.
carfeatures.to_csv("data/features_stored.csv")
carfeatures_dta.to_stata("data/features_stored.dta")
carfeatures_xlsx.to_excel("data/features_stored.xlsx")
We can enter the name of a data frame to take a glimpse at it
carfeatures
| mpg | cylinders | displacement | horsepower | weight | acceleration | vehicle id | |
|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307 | 130 | 3504 | 12.0 | C-1689780 |
| 1 | 15.0 | 8 | 350 | 165 | 3693 | 11.5 | B-1689791 |
| 2 | 18.0 | 8 | 318 | 150 | 3436 | 11.0 | P-1689802 |
| 3 | 16.0 | 8 | 304 | 150 | 3433 | 12.0 | A-1689813 |
| 4 | 17.0 | 8 | 302 | 140 | 3449 | 10.5 | F-1689824 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 393 | 27.0 | 4 | 140 | 86 | 2790 | 15.6 | F-1694103 |
| 394 | 44.0 | 4 | 97 | 52 | 2130 | 24.6 | V-1694114 |
| 395 | 32.0 | 4 | 135 | 84 | 2295 | 11.6 | D-1694125 |
| 396 | 28.0 | 4 | 120 | 79 | 2625 | 18.6 | F-1694136 |
| 397 | 31.0 | 4 | 119 | 82 | 2720 | 19.4 | C-1694147 |
398 rows × 7 columns
If we want output data for a single column, we use [...]
carfeatures["cylinders"]
0 8
1 8
2 8
3 8
4 8
..
393 4
394 4
395 4
396 4
397 4
Name: cylinders, Length: 398, dtype: int64
# Antoher example
carfeatures["mpg"]
0 18.0
1 15.0
2 18.0
3 16.0
4 17.0
...
393 27.0
394 44.0
395 32.0
396 28.0
397 31.0
Name: mpg, Length: 398, dtype: float64
We can create a frequency table using the following syntax
pd.crosstab(index = df["column"], columns = "Name")
pd.crosstab(index = carfeatures['cylinders'],columns = "How many cars have (x) cylinders?")
| col_0 | How many cars have (x) cylinders? |
|---|---|
| cylinders | |
| 3 | 4 |
| 4 | 204 |
| 5 | 3 |
| 6 | 84 |
| 8 | 103 |
# Another example
pd.crosstab(index = carfeatures['mpg'], columns = "Hello")
| col_0 | Hello |
|---|---|
| mpg | |
| 9.0 | 1 |
| 10.0 | 2 |
| 11.0 | 4 |
| 12.0 | 6 |
| 13.0 | 20 |
| ... | ... |
| 43.4 | 1 |
| 44.0 | 1 |
| 44.3 | 1 |
| 44.6 | 1 |
| 46.6 | 1 |
129 rows × 1 columns
We use the following syntax to compute basic summary statistics for all variables in a data set
carfeatures.describe()
| mpg | cylinders | displacement | weight | acceleration | |
|---|---|---|---|---|---|
| count | 398.000000 | 398.000000 | 398.000000 | 398.000000 | 398.000000 |
| mean | 23.514573 | 5.454774 | 193.427136 | 2970.424623 | 15.568090 |
| std | 7.815984 | 1.701004 | 104.268683 | 846.841774 | 2.757689 |
| min | 9.000000 | 3.000000 | 68.000000 | 1613.000000 | 8.000000 |
| 25% | 17.500000 | 4.000000 | 104.250000 | 2223.750000 | 13.825000 |
| 50% | 23.000000 | 4.000000 | 148.500000 | 2803.500000 | 15.500000 |
| 75% | 29.000000 | 8.000000 | 262.000000 | 3608.000000 | 17.175000 |
| max | 46.600000 | 8.000000 | 455.000000 | 5140.000000 | 24.800000 |
Create an empty data frame
data = pd.DataFrame([])
Add variables
# The following are lists with values for different individuals:
# "age" is the number of years,
# "num_underage_siblings" is the total number of underage siblings,
# "num_adult_siblings" is the total number of adult siblings.
data["age"] = [18,29,15,32,6]
data["num_underage_siblings"] = [0,0,1,1,0]
data["num_adult_siblings"] = [1,0,0,1,0]
Define functions
# Small helper functions used with .apply() and map() below.

def fn_iseligible_vote(age):
    """Return True when a person of the given age can vote (18 or older)."""
    return age >= 18

def fn_istwenties(age):
    """Return True when age falls in the interval [20, 30)."""
    return (age >= 20) & (age < 30)

def fn_sum(x, y):
    """Return the sum of the two arguments."""
    return x + y

def fn_agebracket(age):
    """Classify an age as "Adult" (18+), "Adolescent" ([10, 18)) or "Child"."""
    if age >= 18:
        return "Adult"
    if (age >= 10) & (age < 18):
        return "Adolescent"
    return "Child"
Applying functions with one argument:
apply(myfunction)
# The function "apply" extracts each element and returns the
# function value — similar to running a for-loop over each element.
data["can_vote"] = data["age"].apply(fn_iseligible_vote)
data["in_twenties"] = data["age"].apply(fn_istwenties)
data["age_bracket"] = data["age"].apply(fn_agebracket)
# NOTE: The following code also works:
# data["can_vote"] = data["age"].apply(lambda age: age >= 18)
# data["in_twenties"] = data["age"].apply(lambda age: (age >= 20) & (age < 30))
display(data)
| age | num_underage_siblings | num_adult_siblings | can_vote | in_twenties | age_bracket | |
|---|---|---|---|---|---|---|
| 0 | 18 | 0 | 1 | True | False | Adult |
| 1 | 29 | 0 | 0 | True | True | Adult |
| 2 | 15 | 1 | 0 | False | False | Adolescent |
| 3 | 32 | 1 | 1 | True | False | Adult |
| 4 | 6 | 0 | 0 | False | False | Child |
Mapping functions with one or more arguments
list(map(myfunction, list1,list2, ....))
# Repeat the above example with map().
# list() converts the lazy map object into a list.
# The first argument of map() is a function; the remaining arguments
# supply that function's arguments element-wise.
data["can_vote"] = list(map(fn_iseligible_vote, data["age"]))
# With more than two arguments, map() pairs the iterables element-wise.
data["num_siblings"] = list(map(fn_sum,data["num_underage_siblings"],data["num_adult_siblings"]))
Recommended!
data["num_siblings"] = list(map(fn_sum,
data["num_underage_siblings"],
data["num_adult_siblings"]))
*Example*
# Flag individuals with at least one sibling.
check_num_siblings = lambda x: x >= 1
data["has_siblings"] = data["num_siblings"].apply(check_num_siblings)
display(data)
| age | num_underage_siblings | num_adult_siblings | can_vote | in_twenties | age_bracket | num_siblings | has_siblings | |
|---|---|---|---|---|---|---|---|---|
| 0 | 18 | 0 | 1 | True | False | Adult | 1 | True |
| 1 | 29 | 0 | 0 | True | True | Adult | 0 | False |
| 2 | 15 | 1 | 0 | False | False | Adolescent | 1 | True |
| 3 | 32 | 1 | 1 | True | False | Adult | 2 | True |
| 4 | 6 | 0 | 0 | False | False | Child | 0 | False |
*Example*
# Flag cars with mpg of at least 29, then save the cleaned dataset.
carfeatures["mpg_above_29"] = list(map(lambda x: x >= 29, carfeatures["mpg"]))
carfeatures.to_csv("data/features_clean.csv")
*Example*
" A {fruit} is {color}"
# Pair each fruit with its color and format "A {fruit} is {color}."
# sentences via map().

def fruit_color(fruit, color):
    """Return the sentence "A {fruit} is {color}."."""
    return f"A {fruit} is {color}."

list_fruits = ["banana", "strawberry", "kiwi"]
list_colors = ["yellow", "red", "green"]
list(map(fruit_color, list_fruits, list_colors))
['A banana is yellow.', 'A strawberry is red.', 'A kiwi is green.']
(Optional) External Scripts
".ipynb" files ...
".py" files ...
#------------------------------------------------------------------------------#
# The "%run -i" command executes a Python script that is
# in the subfolder "scripts/".
#
# Breaking down the command:
# (a) The percentage sign "%" is associated with "magic commands",
#     which are special functions used in iPython notebooks.
# (b) The suboption "-i" ensures that the program can use any variables
#     defined in the current working environment (in case it's necessary).
#------------------------------------------------------------------------------#
%run -i "scripts/example_functions.py"
x = 1
print(fn_quadratic(1))
print(fn_quadratic(5))
1 25
# When we run this program (scripts/example_variables.py)
# the value of alpha will be overwritten.
alpha = 1
print(alpha)
%run -i "scripts/example_variables.py"
print(alpha)
1 5
The display() command will show the first 5 rows and the last five rows
display(carfeatures)
| mpg | cylinders | displacement | horsepower | weight | acceleration | vehicle id | mpg_above_29 | |
|---|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307 | 130 | 3504 | 12.0 | C-1689780 | False |
| 1 | 15.0 | 8 | 350 | 165 | 3693 | 11.5 | B-1689791 | False |
| 2 | 18.0 | 8 | 318 | 150 | 3436 | 11.0 | P-1689802 | False |
| 3 | 16.0 | 8 | 304 | 150 | 3433 | 12.0 | A-1689813 | False |
| 4 | 17.0 | 8 | 302 | 140 | 3449 | 10.5 | F-1689824 | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 393 | 27.0 | 4 | 140 | 86 | 2790 | 15.6 | F-1694103 | False |
| 394 | 44.0 | 4 | 97 | 52 | 2130 | 24.6 | V-1694114 | True |
| 395 | 32.0 | 4 | 135 | 84 | 2295 | 11.6 | D-1694125 | True |
| 396 | 28.0 | 4 | 120 | 79 | 2625 | 18.6 | F-1694136 | False |
| 397 | 31.0 | 4 | 119 | 82 | 2720 | 19.4 | C-1694147 | True |
398 rows × 8 columns
Extract column names
# Write the name of the dataset and use a period "." to extract
# the attribute "columns" and its sub-attribute "values".
car_colnames = carfeatures.columns.values
print(car_colnames)
['mpg' 'cylinders' 'displacement' 'horsepower' 'weight' 'acceleration' 'vehicle id' 'mpg_above_29']
Subset columns:
data[list_names]
# To subset multiple columns, write the name of the dataset
# and enter a list of column names in square brackets next to it.
list_subsetcols = ["weight","mpg"]
subcols_carfeatures = carfeatures[list_subsetcols]
display(subcols_carfeatures)
# Or we can simply include the list directly inside square brackets.
display(carfeatures[["weight","mpg"]])
| weight | mpg | |
|---|---|---|
| 0 | 3504 | 18.0 |
| 1 | 3693 | 15.0 |
| 2 | 3436 | 18.0 |
| 3 | 3433 | 16.0 |
| 4 | 3449 | 17.0 |
| ... | ... | ... |
| 393 | 2790 | 27.0 |
| 394 | 2130 | 44.0 |
| 395 | 2295 | 32.0 |
| 396 | 2625 | 28.0 |
| 397 | 2720 | 31.0 |
398 rows × 2 columns
| weight | mpg | |
|---|---|---|
| 0 | 3504 | 18.0 |
| 1 | 3693 | 15.0 |
| 2 | 3436 | 18.0 |
| 3 | 3433 | 16.0 |
| 4 | 3449 | 17.0 |
| ... | ... | ... |
| 393 | 2790 | 27.0 |
| 394 | 2130 | 44.0 |
| 395 | 2295 | 32.0 |
| 396 | 2625 | 28.0 |
| 397 | 2720 | 31.0 |
398 rows × 2 columns
*Example* Extract the weight and acceleration variables
display(carfeatures[["weight", "acceleration"]])
| weight | acceleration | |
|---|---|---|
| 0 | 3504 | 12.0 |
| 1 | 3693 | 11.5 |
| 2 | 3436 | 11.0 |
| 3 | 3433 | 12.0 |
| 4 | 3449 | 10.5 |
| ... | ... | ... |
| 393 | 2790 | 15.6 |
| 394 | 2130 | 24.6 |
| 395 | 2295 | 11.6 |
| 396 | 2625 | 18.6 |
| 397 | 2720 | 19.4 |
398 rows × 2 columns
Sort by column
carsorted = carfeatures.sort_values(by = "mpg", ascending = False) # False: descending order
display(carsorted)
| mpg | cylinders | displacement | horsepower | weight | acceleration | vehicle id | mpg_above_29 | |
|---|---|---|---|---|---|---|---|---|
| 322 | 46.6 | 4 | 86 | 65 | 2110 | 17.9 | M-1693322 | True |
| 329 | 44.6 | 4 | 91 | 67 | 1850 | 13.8 | H-1693399 | True |
| 325 | 44.3 | 4 | 90 | 48 | 2085 | 21.7 | V-1693355 | True |
| 394 | 44.0 | 4 | 97 | 52 | 2130 | 24.6 | V-1694114 | True |
| 326 | 43.4 | 4 | 90 | 48 | 2335 | 23.7 | V-1693366 | True |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 103 | 11.0 | 8 | 400 | 150 | 4997 | 14.0 | C-1690913 | False |
| 67 | 11.0 | 8 | 429 | 208 | 4633 | 11.0 | M-1690517 | False |
| 25 | 10.0 | 8 | 360 | 215 | 4615 | 14.0 | F-1690055 | False |
| 26 | 10.0 | 8 | 307 | 200 | 4376 | 15.0 | C-1690066 | False |
| 28 | 9.0 | 8 | 304 | 193 | 4732 | 18.5 | H-1690088 | False |
398 rows × 8 columns
Subset row(s)
data.iloc[ row_int , : ] $\quad$ or
data.iloc[ list_rows, : ]
# The following command extracts all columns for row zero
# Remember that numbering starts at zero in Python
# In this case we will show the car with the highest "mpg" value
display(carsorted.iloc[0,:])
display(carsorted.iloc[[0,1,2],:])
mpg 46.6 cylinders 4 displacement 86 horsepower 65 weight 2110 acceleration 17.9 vehicle id M-1693322 mpg_above_29 True Name: 322, dtype: object
| mpg | cylinders | displacement | horsepower | weight | acceleration | vehicle id | mpg_above_29 | |
|---|---|---|---|---|---|---|---|---|
| 322 | 46.6 | 4 | 86 | 65 | 2110 | 17.9 | M-1693322 | True |
| 329 | 44.6 | 4 | 91 | 67 | 1850 | 13.8 | H-1693399 | True |
| 325 | 44.3 | 4 | 90 | 48 | 2085 | 21.7 | V-1693355 | True |
Subset block of rows
data.iloc[ lower:upper , : ]
# Extract rows 0 to 4 (the upper bound 5 is excluded by slicing).
display(carfeatures.iloc[0:5,:])
# Extract rows 8 onwards.
display(carfeatures.iloc[8:, : ])
# Note: We can leave the numbers to the left and right of ":" blank
# in order to select all values before or after, respectively.
| mpg | cylinders | displacement | horsepower | weight | acceleration | vehicle id | mpg_above_29 | |
|---|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307 | 130 | 3504 | 12.0 | C-1689780 | False |
| 1 | 15.0 | 8 | 350 | 165 | 3693 | 11.5 | B-1689791 | False |
| 2 | 18.0 | 8 | 318 | 150 | 3436 | 11.0 | P-1689802 | False |
| 3 | 16.0 | 8 | 304 | 150 | 3433 | 12.0 | A-1689813 | False |
| 4 | 17.0 | 8 | 302 | 140 | 3449 | 10.5 | F-1689824 | False |
| mpg | cylinders | displacement | horsepower | weight | acceleration | vehicle id | mpg_above_29 | |
|---|---|---|---|---|---|---|---|---|
| 8 | 14.0 | 8 | 455 | 225 | 4425 | 10.0 | P-1689868 | False |
| 9 | 15.0 | 8 | 390 | 190 | 3850 | 8.5 | A-1689879 | False |
| 10 | 15.0 | 8 | 383 | 170 | 3563 | 10.0 | D-1689890 | False |
| 11 | 14.0 | 8 | 340 | 160 | 3609 | 8.0 | P-1689901 | False |
| 12 | 15.0 | 8 | 400 | 150 | 3761 | 9.5 | C-1689912 | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 393 | 27.0 | 4 | 140 | 86 | 2790 | 15.6 | F-1694103 | False |
| 394 | 44.0 | 4 | 97 | 52 | 2130 | 24.6 | V-1694114 | True |
| 395 | 32.0 | 4 | 135 | 84 | 2295 | 11.6 | D-1694125 | True |
| 396 | 28.0 | 4 | 120 | 79 | 2625 | 18.6 | F-1694136 | False |
| 397 | 31.0 | 4 | 119 | 82 | 2720 | 19.4 | C-1694147 | True |
390 rows × 8 columns
Similar for columns
data.iloc[ : , col_integer ]data.iloc[ : , list_cols ]data.iloc[ list_rows , list_cols ]*Example*
Sort cars from lowest to highest mpg.
HINT: Use sort_values(..., ascending = True)
# Sort from lowest to highest "mpg", then display the first five rows
car_ascendingmpg = carfeatures.sort_values(by="mpg", ascending=True)
display(car_ascendingmpg.iloc[0:5, :])
| mpg | cylinders | displacement | horsepower | weight | acceleration | vehicle id | mpg_above_29 | |
|---|---|---|---|---|---|---|---|---|
| 28 | 9.0 | 8 | 304 | 193 | 4732 | 18.5 | H-1690088 | False |
| 25 | 10.0 | 8 | 360 | 215 | 4615 | 14.0 | F-1690055 | False |
| 26 | 10.0 | 8 | 307 | 200 | 4376 | 15.0 | C-1690066 | False |
| 103 | 11.0 | 8 | 400 | 150 | 4997 | 14.0 | C-1690913 | False |
| 124 | 11.0 | 8 | 350 | 180 | 3664 | 11.0 | O-1691144 | False |
Filter the data based on its content
data.query("logical expression")
(i) Expressions with colnames
data_threshold_mpg = carfeatures.query("mpg >= 25")
data_rangeacceleration = carfeatures.query("(acceleration >= 10) & (acceleration < 18)")
(ii) Expressions with colnames + global variables (@)
# You can invoke global variables into the query by using @variablename
# If you don't include @, then Python will try to look for a column with
# that name.
threshold = 25
data_varthreshold_mpg = carfeatures.query("mpg >= @threshold")
(iii) Expressions with colnames with spaces
# Sometimes column names have spaces in them
# In this case use the "`" symbol, e.g. `variable name`
carfeatures["new variable"] = carfeatures["mpg"]
data_spacesthreshold_mpg = carfeatures.query("`new variable` >= 25")
*Example*
# Filter rows on two conditions joined with "&", thresholds written inline ...
display(carfeatures.query("(mpg >= 25) & (cylinders == 8)"))
# ... and the same query referencing global variables with "@"
mpg_threshold = 25
cylinders_threshold = 8
display(carfeatures.query("(mpg >= @mpg_threshold) & (cylinders == @cylinders_threshold)"))
| mpg | cylinders | displacement | horsepower | weight | acceleration | vehicle id | mpg_above_29 | new variable | |
|---|---|---|---|---|---|---|---|---|---|
| 364 | 26.6 | 8 | 350 | 105 | 3725 | 19.0 | O-1693784 | False | 26.6 |
| mpg | cylinders | displacement | horsepower | weight | acceleration | vehicle id | mpg_above_29 | new variable | |
|---|---|---|---|---|---|---|---|---|---|
| 364 | 26.6 | 8 | 350 | 105 | 3725 | 19.0 | O-1693784 | False | 26.6 |
List of unique categories
# ".unique()" on a column returns an array of its distinct values
list_unique_cylinders = carfeatures["cylinders"].unique()
print(list_unique_cylinders)
[8 4 6 3 5]
Compute two overlapping plots
# Overlay two scatter plots on the same figure: call plt.scatter() twice,
# then plt.show() once at the very end so both appear on one set of axes.
# (Calling plt.show() between the two calls would instead produce
#  separate figures.)
cars_eight_cyl = carfeatures.query("cylinders == 8")
cars_four_cyl = carfeatures.query("cylinders == 4")
plt.scatter(x=cars_eight_cyl["weight"], y=cars_eight_cyl["acceleration"])
plt.scatter(x=cars_four_cyl["weight"], y=cars_four_cyl["acceleration"])
plt.legend(labels=["8", "4"], title="Cylinders")
plt.show()
Compute plots by all categories
# Scatter "weight" vs "acceleration" separately for every cylinder
# category; each loop iteration adds one layer in its own color.
list_unique_cylinders = pd.unique(carfeatures["cylinders"])
for num_cylinders in list_unique_cylinders:
    subset = carfeatures.query("cylinders == @num_cylinders")
    plt.scatter(x=subset["weight"], y=subset["acceleration"])
# Label the axes and attach a legend
plt.xlabel("Weight")
plt.ylabel("Acceleration")
plt.legend(labels=list_unique_cylinders,
           title="Cylinders")
plt.show()
*Example*
Use the option alpha in plt.hist(x = ..., alpha = 0.5) to make overlapping histograms semi-transparent.
# Overlapping histograms of "mpg", one per cylinder category;
# alpha = 0.5 makes each histogram semi-transparent so all remain visible
list_unique_cylinders = pd.unique(carfeatures["cylinders"])
for num_cylinders in list_unique_cylinders:
    subset = carfeatures.query("cylinders == @num_cylinders")
    plt.hist(x=subset["mpg"], alpha=0.5)
plt.xlabel("mpg")
plt.ylabel("Frequency")
plt.legend(labels=list_unique_cylinders, title="Cylinders")
plt.show()
Random assignment is crucial for scientific progress ...
# "list_status" is a list with "treatment/control" arms
# "prop_status" is the proportion in the treatment and control arms
# "size_dataset" is how many rows are contained
list_status = ["Treatment","Control"]
prop_status = [0.4,0.6]
size_dataset = len(carfeatures)
Random assignment
# The "np.random.choice" will create a vector with the status
# We will save this to a column in "carfeatures"
# Note: (i) We can always split the arguments of a function in multiple lines
# to make it easier to read
# (ii)
carfeatures["status"] = np.random.choice(list_status,
size = size_dataset,
p = prop_status)
display(carfeatures)
| mpg | cylinders | displacement | horsepower | weight | acceleration | vehicle id | mpg_above_29 | new variable | status | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307 | 130 | 3504 | 12.0 | C-1689780 | False | 18.0 | Control |
| 1 | 15.0 | 8 | 350 | 165 | 3693 | 11.5 | B-1689791 | False | 15.0 | Treatment |
| 2 | 18.0 | 8 | 318 | 150 | 3436 | 11.0 | P-1689802 | False | 18.0 | Treatment |
| 3 | 16.0 | 8 | 304 | 150 | 3433 | 12.0 | A-1689813 | False | 16.0 | Control |
| 4 | 17.0 | 8 | 302 | 140 | 3449 | 10.5 | F-1689824 | False | 17.0 | Treatment |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 393 | 27.0 | 4 | 140 | 86 | 2790 | 15.6 | F-1694103 | False | 27.0 | Treatment |
| 394 | 44.0 | 4 | 97 | 52 | 2130 | 24.6 | V-1694114 | True | 44.0 | Control |
| 395 | 32.0 | 4 | 135 | 84 | 2295 | 11.6 | D-1694125 | True | 32.0 | Control |
| 396 | 28.0 | 4 | 120 | 79 | 2625 | 18.6 | F-1694136 | False | 28.0 | Control |
| 397 | 31.0 | 4 | 119 | 82 | 2720 | 19.4 | C-1694147 | True | 31.0 | Control |
398 rows × 10 columns
Compute frequencies by status
# The command "pd.crosstab" computes frequencies
# If we add the option "normalize" it will compute proportions
# Note: The default assignment is done randomly without replacement
# which means that the proportions are approximately the same
# (but not equal) to "prop_status"
frequency_table = pd.crosstab(index = carfeatures["status"], columns = "Frequency")
proportions_table = pd.crosstab(index = carfeatures["status"],
columns = "Frequency",
normalize = True)
display(frequency_table)
display(proportions_table)
| col_0 | Frequency |
|---|---|
| status | |
| Control | 233 |
| Treatment | 165 |
| col_0 | Frequency |
|---|---|
| status | |
| Control | 0.585427 |
| Treatment | 0.414573 |
Query with string conditions
# When you have queries for text variables, it's important
# to use outer ' ' single quotations
# and inner double quotations.
data_treated = carfeatures.query('status == "Treatment" ')
data_control = carfeatures.query('status == "Control" ')
Treated/control should be similar
# The "count" differs because we assigned the arms different proportions
# All other summary statistics are approximately the same
# They are not identical because the assignment is random
display(data_treated.describe())
display(data_control.describe())
| mpg | cylinders | displacement | weight | acceleration | new variable | |
|---|---|---|---|---|---|---|
| count | 165.000000 | 165.000000 | 165.000000 | 165.000000 | 165.000000 | 165.000000 |
| mean | 22.605455 | 5.600000 | 203.927273 | 3060.224242 | 15.503030 | 22.605455 |
| std | 7.451144 | 1.717201 | 106.798281 | 887.013772 | 2.657925 | 7.451144 |
| min | 9.000000 | 3.000000 | 70.000000 | 1800.000000 | 9.000000 | 9.000000 |
| 25% | 17.000000 | 4.000000 | 101.000000 | 2220.000000 | 13.700000 | 17.000000 |
| 50% | 21.100000 | 6.000000 | 168.000000 | 2904.000000 | 15.500000 | 21.100000 |
| 75% | 27.500000 | 8.000000 | 302.000000 | 3725.000000 | 17.000000 | 27.500000 |
| max | 46.600000 | 8.000000 | 455.000000 | 5140.000000 | 22.200000 | 46.600000 |
| mpg | cylinders | displacement | weight | acceleration | new variable | |
|---|---|---|---|---|---|---|
| count | 233.000000 | 233.000000 | 233.000000 | 233.000000 | 233.000000 | 233.000000 |
| mean | 24.158369 | 5.351931 | 185.991416 | 2906.832618 | 15.614163 | 24.158369 |
| std | 8.017875 | 1.685565 | 102.016944 | 813.141144 | 2.830973 | 8.017875 |
| min | 10.000000 | 3.000000 | 68.000000 | 1613.000000 | 8.000000 | 10.000000 |
| 25% | 18.000000 | 4.000000 | 105.000000 | 2234.000000 | 13.900000 | 18.000000 |
| 50% | 23.500000 | 4.000000 | 140.000000 | 2720.000000 | 15.400000 | 23.500000 |
| 75% | 31.000000 | 6.000000 | 250.000000 | 3445.000000 | 17.300000 | 31.000000 |
| max | 44.600000 | 8.000000 | 455.000000 | 4997.000000 | 24.800000 | 44.600000 |
To clean a data set, we first see the types of it.
# Produces a list with the data types of each column
# Columns that say "object" have either strings or
# a mix of string and numeric values
circuits.dtypes
circuitId int64 circuitRef object name object location object country object lat float64 lng float64 alt object url object dtype: object
From the code book, we know alt should be numeric data, but it is an object (text string) according to the last output.
Hence, we use .str.isnumeric() to check which values of alt are not numeric.
# The ".str.isnumeric()" checks whether each row is numeric or now.
# Using the "." twice is an example of "piping"
# which refers to chaining two commands "str" and "isnumeric()"
circuits["alt"].str.isnumeric()
0 True
1 True
2 True
3 True
4 True
...
72 True
73 True
74 True
75 False
76 False
Name: alt, Length: 77, dtype: bool
To extract those rows of non-numeric values, we combine the .str.isnumeric() function with .query().
# ".query()" can reference column sub-methods such as ".str.isnumeric()"
# Keep only the rows whose "alt" value is NOT numeric, then list the
# distinct offending values with "pd.unique()"
subset = circuits.query("alt.str.isnumeric() == False")
list_unique = pd.unique(subset["alt"])
print(list_unique)
['\\N' '-7']
To clean the data set, we change the non-numeric values to numeric values.
# "list_old" encodes values we want to change
# "list_new" encodes the values that will "replace" the old
list_old = ['\\N','-7']
list_new = [np.nan,-7]
# This command replaces the values of the "alt" column
circuits["alt"] = circuits["alt"].replace(list_old, list_new)
# Note: The option "inplace = True" permanently modifies the column
# circuits["alt"].replace(list_old, list_new, inplace = True)
*Example*
.replace() with the "country" columncircuits["country"] = circuits["country"].replace("UK", "United Kingdom")
*Example*
lat or long?.query() + .str.isnumeric()# Write your own code
# subset_lat = circuits.query("lat.str.isnumeric() == False")
# subset_long = circuits.query("lng.str.isnumeric() == False")
# ERROR: Only use .str accessor with string values! --> no string values in lat or long
circuits[["lat", "lng"]].dtypes
lat float64 lng float64 dtype: object
Sometimes, we need to recode the values. The very common one is to convert values from object to numeric (text strings will be recoded as NaNs after conversion).
circuits["alt_numeric"] = pd.to_numeric(circuits["alt"])
print(circuits["alt_numeric"].mean())
248.1891891891892
In other cases, the recoding process might be more complicated.
*Example*
Recode values based on an interval
$$ \qquad x_{bin} = \begin{cases} `A' &\text{ if }\quad x_1 < x \le x_2 \\ `B' &\text{ if }\quad x_2 < x \le x_3 \end{cases} $$# bins and labels must increase monotonically
# Recode a numeric column into labeled intervals with "pd.cut"
# (bins and labels must increase monotonically)
bins_x = [0, 2500, 5000]
labels_x = ["Between 0 and 2500",
            "Between 2500 and 5000"]
circuits["bins_alt"] = pd.cut(circuits["alt_numeric"],
                              bins = bins_x,
                              right = True,
                              labels = labels_x)
# Note: if we set bins_x = [float("-inf"), 2500, float("inf")]
# then intervals are "Less than or equal to 2500" and "Above 2500"
# float("inf") and float("-inf") represent infinity and negative infinity
# "right = True" makes each interval closed on the right, i.e.
# "greater than the left edge and less than or equal to the right edge"
*Example*
bins_lat = [-90, 0, 90]
labels_lat = ["south", "north"]
circuits["hemisphere"] = pd.cut(circuits["lat"],
bins = bins_lat,
right = True,
labels = labels_lat)
In this section, we will use the result data set. Let's first see its rows and data types
results.dtypes
resultId int64 raceId int64 driverId int64 constructorId int64 number object grid int64 position object positionText object positionOrder int64 points float64 laps int64 time object milliseconds object fastestLap object rank object fastestLapTime object fastestLapSpeed object statusId int64 dtype: object
Further, we will answer the following questions:
$\qquad$ "resultId"?
$\qquad$ "raceId"?
$\qquad$ "driverId"?
HINT: Use the "len()" and the "pd.unique()" functions
# Total row count vs. number of distinct values in each id column
print(len(results))
print(len(results["resultId"].unique()))  # primary key: uniquely identifies each row
print(len(results["raceId"].unique()))
print(len(results["driverId"].unique()))
25840 25840 1079 855
Splitting code into multiple lines
# The following code computes descriptive statistics for points
# Wrapping the code in parentheses "()" allows you to split it into multiple
# lines. It's considered good practice to make each line less than 80 characters
# This makes it easier to scroll up and down without going sideways.
descriptives_singleline = results["points"].describe()
descriptives_multiline = (results["points"]
.describe())
print(descriptives_multiline)
print(descriptives_singleline) # They are the same!
count 25840.000000 mean 1.877053 std 4.169849 min 0.000000 25% 0.000000 50% 0.000000 75% 2.000000 max 50.000000 Name: points, dtype: float64 count 25840.000000 mean 1.877053 std 4.169849 min 0.000000 25% 0.000000 50% 0.000000 75% 2.000000 max 50.000000 Name: points, dtype: float64
The .agg() subfunction computes aggregate statistics
# The ".agg()" subfunction computes aggregate statistics
# The syntax is ("column_name","function_name")
# The first argument is the column name
# The second argument is the function_name
# The command works with single quotations '...' or double "..."
results_agg = results.agg(mean_points = ('points','mean'),
sd_points = ('points','std'),
min_points = ('points','min'),
max_points = ('points','max'),
count_obs = ('points',len))
display(results_agg)
| points | |
|---|---|
| mean_points | 1.877053 |
| sd_points | 4.169849 |
| min_points | 0.000000 |
| max_points | 50.000000 |
| count_obs | 25840.000000 |
We can further apply groupby() to our code.
# Drivers appear in multiple car races; grouping by "driverId" computes
# the aggregate statistics separately for each driver across all of
# their races
drivers_agg = (results.groupby("driverId")
               .agg(mean_points = ('points','mean'),
                    sd_points = ('points','std'),
                    min_points = ('points','min'),
                    max_points = ('points','max'),
                    count_obs = ('points',len)))
# One row per distinct driver
len(drivers_agg)
855
We can group by multiple groups
# We can aggregate statistics from multiple columns by
# entering a list of column names in "groupby"
# In this case "constructor" in this case denotes the team
# The following computes aggregate point stats for each (team, race) combination
teamrace_agg = (results.groupby(["raceId","constructorId"])
.agg(mean_points = ('points','mean'),
sd_points = ('points','std'),
min_points = ('points','min'),
max_points = ('points','max'),
count_obs = ('points',len)))
len(teamrace_agg)
12568
We can combine group by and query.
# The following gets a subset of the data using .query()
# In this case we subset the data before computing aggregate statistics
# Note: "filtering" is often the word used to obtain a subset
teamrace_agg = (results.query("raceId >= 500")
.groupby(["raceId","constructorId"])
.agg(mean_points = ('points','mean'),
sd_points = ('points','std'),
min_points = ('points','min'),
max_points = ('points','max'),
count_obs = ('points',len)))
*Example*
race_agg = (results.groupby("raceId")
.agg(mean_points = ("points", "mean"),
mean_laps = ("laps", "mean")))
len(race_agg)
1079
*Example*
constructor_sorted_agg = (results.groupby("constructorId")
.agg(mean_points = ("points", "mean"))
.sort_values("mean_points", ascending = False))
(*Optional*) We can compute relative statistics within group using the merge() function.
# Merge the "aggregate" information in "drivers_agg" into "results"
# The merging variable "on" is "driverId", a column common to both datasets
# "how = left" keeps every row of the left dataset ("results") as the baseline
#
# Note: For this method to work well "driverId" needs to contain unique values
# in "drivers_agg". If not you may need to clean the data beforehand
results_merge = pd.merge(results,
                         drivers_agg,
                         on = "driverId",
                         how = "left")
Also, we could use the transform() function to achieve the same result.
# We've use "transform" to compute a column with aggregate statistics
# If we add the pipe "groupby" the aggregate statistics are computed by group
# with group level averages.
# We can use any aggregate function, including "mean", "std", "max","min", etc.
results["mean_points_driver"] = results.groupby("driverId")["points"].transform("mean")
results["std_points_driver"] = results.groupby("driverId")["points"].transform("std")
We can use transform() to compute ranks
# ".rank" calculates each row's relative position
# 'method = "dense"' gives tied values the same rank
# 'ascending = False' means the HIGHEST score gets rank "1",
# the second highest gets "2", etc.
results["rank_points"] = results["points"].rank(method = "dense",
                                                ascending = False)
# The graph shows that the winner gets 50 points
plt.scatter(x = results["rank_points"],y = results["points"])
plt.ylabel("Number of Points")
plt.xlabel("Relative Ranking")
Text(0.5, 0, 'Relative Ranking')
The following code computes ranks by group
# The subfunction "transform" allows us to pass-along some of the options
results["rank_points_withinteam"] = (results.groupby("constructorId")["points"]
.transform("rank",
method = "dense",
ascending = True))
*Example*
Note: This plots tells you how much a driver's performance on individual races deviates from their overall average
plt.scatter(x = results_merge["mean_points"], y = results_merge["points"])
plt.xlabel("Mean Points")
plt.ylabel("Points")
plt.show()
*Example*
$\qquad$ on = ["raceId","constructorId"]
results_teamrace_merge = pd.merge(results,
teamrace_agg,
on = ["raceId", "constructorId"],
how = "left")
len(results_teamrace_merge)
25840
Before we start merging data, we need to check columns with similar names.
# We extract all the unique values in races_raw["name"] and circuits_raw["name"]
# We use "sort_values()" to make it easier to compare the variables
# The "codebook/f1_codebook.pdf" file shows that the content is indeed different
unique_data_races = pd.unique(races_raw["name"].sort_values())
unique_data_circuits = pd.unique(circuits_raw["name"].sort_values())
We need to rename those similar names so we will not be confused after the merging.
Renaming columns' names, we will use dictionaries.
# Rename columns with a dictionary mapping {old_name: new_name}
# passed to the ".rename(columns = ...)" pipe
# Dictionaries can flexibly accommodate single values or a list after ":"
dict_rename_circuits = {"name": "circuit_name"}
circuits = circuits_raw.rename(columns = dict_rename_circuits)
# Compare the column names of the "raw" and renamed data
print("Old List:")
print(circuits_raw.columns.values)
print("")
print("New List:")
print(circuits.columns.values)
Old List: ['circuitId' 'circuitRef' 'name' 'location' 'country' 'lat' 'lng' 'alt' 'url'] New List: ['circuitId' 'circuitRef' 'circuit_name' 'location' 'country' 'lat' 'lng' 'alt' 'url']
*Example*
dict_rename_race = {"name": "race_name"}
races = races_raw.rename(columns = dict_rename_race)
# Check rename works
print(f"Old names: \n {races_raw.columns.values}\n")
print(f"New names: \n {races.columns.values}")
Old names: ['raceId' 'year' 'round' 'circuitId' 'name' 'date' 'time' 'url' 'fp1_date' 'fp1_time' 'fp2_date' 'fp2_time' 'fp3_date' 'fp3_time' 'quali_date' 'quali_time' 'sprint_date' 'sprint_time'] New names: ['raceId' 'year' 'round' 'circuitId' 'race_name' 'date' 'time' 'url' 'fp1_date' 'fp1_time' 'fp2_date' 'fp2_time' 'fp3_date' 'fp3_time' 'quali_date' 'quali_time' 'sprint_date' 'sprint_time']
Now, we can actually start merging data sets!
# We can extract multiple columns of a data set by creating
# a list of those clumns' names.
circuits[["circuitId","circuit_name"]]
| circuitId | circuit_name | |
|---|---|---|
| 0 | 1 | Albert Park Grand Prix Circuit |
| 1 | 2 | Sepang International Circuit |
| 2 | 3 | Bahrain International Circuit |
| 3 | 4 | Circuit de Barcelona-Catalunya |
| 4 | 5 | Istanbul Park |
| ... | ... | ... |
| 72 | 75 | Autódromo Internacional do Algarve |
| 73 | 76 | Autodromo Internazionale del Mugello |
| 74 | 77 | Jeddah Corniche Circuit |
| 75 | 78 | Losail International Circuit |
| 76 | 79 | Miami International Autodrome |
77 rows × 2 columns
Merge datasets
pd.merge(data1,data2,on,how)
# The "pd.merge()" command combines the information from both datasets
# The first argument is the "primary" datasets
# The second argument is the "secondary" dataset (much include the "on" column)
# The "on" is the common variable that is used for merging
# how = "left" tells Python that the left dataset is the primary one
races_merge = pd.merge(races_raw,
circuits[["circuitId","circuit_name"]],
on = "circuitId",
how = "left")
display(races_merge[["raceId", "circuitId", "circuit_name"]].sort_values(by = "circuitId"))
print(f"Old columns:\n{races.columns.values}\n\nNew columns:\n{races_merge.columns.values}")
| raceId | circuitId | circuit_name | |
|---|---|---|---|
| 0 | 1 | 1 | Albert Park Grand Prix Circuit |
| 54 | 55 | 1 | Albert Park Grand Prix Circuit |
| 70 | 71 | 1 | Albert Park Grand Prix Circuit |
| 858 | 860 | 1 | Albert Park Grand Prix Circuit |
| 89 | 90 | 1 | Albert Park Grand Prix Circuit |
| ... | ... | ... | ... |
| 1096 | 1115 | 78 | Losail International Circuit |
| 1038 | 1051 | 78 | Losail International Circuit |
| 1083 | 1102 | 79 | Miami International Autodrome |
| 1061 | 1078 | 79 | Miami International Autodrome |
| 1100 | 1119 | 80 | Las Vegas Strip Street Circuit |
1102 rows × 3 columns
Old columns: ['raceId' 'year' 'round' 'circuitId' 'race_name' 'date' 'time' 'url' 'fp1_date' 'fp1_time' 'fp2_date' 'fp2_time' 'fp3_date' 'fp3_time' 'quali_date' 'quali_time' 'sprint_date' 'sprint_time'] New columns: ['raceId' 'year' 'round' 'circuitId' 'name' 'date' 'time' 'url' 'fp1_date' 'fp1_time' 'fp2_date' 'fp2_time' 'fp3_date' 'fp3_time' 'quali_date' 'quali_time' 'sprint_date' 'sprint_time' 'circuit_name']
# Another example of merging
results_merge = pd.merge(results_raw,
races_raw[["raceId","date"]],
on = "raceId",
how = "left")
print(f"Old columns:\n{results_raw.columns.values}\n\nNew columns:\n{results_merge.columns.values}")
Old columns: ['resultId' 'raceId' 'driverId' 'constructorId' 'number' 'grid' 'position' 'positionText' 'positionOrder' 'points' 'laps' 'time' 'milliseconds' 'fastestLap' 'rank' 'fastestLapTime' 'fastestLapSpeed' 'statusId'] New columns: ['resultId' 'raceId' 'driverId' 'constructorId' 'number' 'grid' 'position' 'positionText' 'positionOrder' 'points' 'laps' 'time' 'milliseconds' 'fastestLap' 'rank' 'fastestLapTime' 'fastestLapSpeed' 'statusId' 'date']
If we don't deal with similar names before merging, the merge function will still work.
However, the columns names will be renamed to column_x and column_y by Python. Sometimes, those names are not self-explanatory enough, and we will be confused by the names afterwards. That's why renaming before merging is very important.
# Merging the raw data, where BOTH "races_raw" and "circuits_raw"
# still have a column called "name"
races_merge_pitfall = pd.merge(races_raw,
                               circuits_raw[["circuitId","name"]],
                               on = "circuitId",
                               how = "left")
# Python internally renames the clashing columns "name_x" (for the left
# dataset) and "name_y" (for the right dataset)
print(races_merge_pitfall.columns.values)
['raceId' 'year' 'round' 'circuitId' 'name_x' 'date' 'time' 'url' 'fp1_date' 'fp1_time' 'fp2_date' 'fp2_time' 'fp3_date' 'fp3_time' 'quali_date' 'quali_time' 'sprint_date' 'sprint_time' 'name_y']
*Example*
$\quad$ HINT: Create a dictionary and use "pd.rename()"
dict_rename_races_pitfall = {"name_x": "race_name","name_y": "circuit_name"}
races_pitfall = races_merge_pitfall.rename(columns = dict_rename_races_pitfall)
# Check the rename works
print(f"Old columns:\n{races_merge_pitfall.columns.values}\n\nNew columns:\n{races_pitfall.columns.values}")
Old columns: ['raceId' 'year' 'round' 'circuitId' 'name_x' 'date' 'time' 'url' 'fp1_date' 'fp1_time' 'fp2_date' 'fp2_time' 'fp3_date' 'fp3_time' 'quali_date' 'quali_time' 'sprint_date' 'sprint_time' 'name_y'] New columns: ['raceId' 'year' 'round' 'circuitId' 'race_name' 'date' 'time' 'url' 'fp1_date' 'fp1_time' 'fp2_date' 'fp2_time' 'fp3_date' 'fp3_time' 'quali_date' 'quali_time' 'sprint_date' 'sprint_time' 'circuit_name']
*Example*
races_merge_location = pd.merge(races,
circuits[["circuitId", "alt", "lng", "lat"]],
on = "circuitId",
how = "left")
# Check the merge works
print(f"Old columns:\n{races.columns.values}\n\nNew columns:\n{races_merge_location.columns.values}")
Old columns: ['raceId' 'year' 'round' 'circuitId' 'race_name' 'date' 'time' 'url' 'fp1_date' 'fp1_time' 'fp2_date' 'fp2_time' 'fp3_date' 'fp3_time' 'quali_date' 'quali_time' 'sprint_date' 'sprint_time'] New columns: ['raceId' 'year' 'round' 'circuitId' 'race_name' 'date' 'time' 'url' 'fp1_date' 'fp1_time' 'fp2_date' 'fp2_time' 'fp3_date' 'fp3_time' 'quali_date' 'quali_time' 'sprint_date' 'sprint_time' 'alt' 'lng' 'lat']
Similar to merging, we can CONCAT data sets.
To prepare for concatenation, use ".query()" to split the data into different parts
circuits_spain = circuits.query('country == "Spain"')
circuits_usa = circuits.query('country == "United States" | country == "USA"')
circuits_malaysia = circuits.query('country == "Malaysia"')
Then, we will concatenate data back together
# Works best if columns are identical
# There are also other advanced options if they are not
# https://pandas.pydata.org/docs/reference/api/pandas.concat.html
circuits_concat = pd.concat([circuits_spain, circuits_usa])
*Example*
circuits_concat_usMalaysia = pd.concat([circuits_usa, circuits_malaysia])
We have previously encountered the concept of chaining (or piping), but in this section, we will formally practice our ability to code using chaining.
# In preparation, duplicate "points" into a column whose name contains a space
results["points col"] = results["points"]
Review Dataset Operations
See attached file for a refresher on syntax
[] $\qquad \qquad \qquad \quad$: Extracting columns
.query() $\qquad \qquad $: Subsetting rows
.recode() $ \qquad \quad \ \ $: Replacing values
.groupby().agg() Aggregate statistics by subgroup
.rename() $\qquad \quad \ \ $: Change name of columns
Full list: https://www.w3schools.com/python/pandas/pandas_ref_dataframe.asp
The operations with "." are read left to right
*Example*
Subsetting before extracting columns
results.query('points >= 20')[["driverId","points"]]
| driverId | points | |
|---|---|---|
| 20320 | 4 | 25.0 |
| 20344 | 18 | 25.0 |
| 20368 | 20 | 25.0 |
| 20392 | 18 | 25.0 |
| 20416 | 17 | 25.0 |
| ... | ... | ... |
| 25740 | 830 | 25.0 |
| 25760 | 830 | 25.0 |
| 25780 | 830 | 25.0 |
| 25800 | 847 | 26.0 |
| 25820 | 830 | 25.0 |
262 rows × 2 columns
*Example*
Subsetting before aggregating
# This obtains a subset of drivers who competed in races 500 onwards
# then computes the average by team ("constructorId")
(results.query('raceId >= 500')
.groupby("constructorId")
.agg(mean_points = ("points","mean")))
| mean_points | |
|---|---|
| constructorId | |
| 1 | 3.148148 |
| 3 | 1.904924 |
| 4 | 1.903226 |
| 5 | 1.203911 |
| 6 | 4.910966 |
| ... | ... |
| 209 | 0.012821 |
| 210 | 0.809028 |
| 211 | 3.723684 |
| 213 | 2.327869 |
| 214 | 3.693182 |
176 rows × 1 columns
*Example*
Subsetting after aggregating
# This obtains the average points by team ("constructorId"), then
# produces a subset of team whose average is higher than 10
(results.groupby("constructorId")
.agg(mean_points = ("points","mean"))
.query('mean_points >= 10'))
| mean_points | |
|---|---|
| constructorId | |
| 131 | 12.363643 |
*Example*
Chaining inside queries + NaNs
.isna() produces a True/False vector, checking for missing values; .notna() produces a True/False vector, checking for non-missing values. Both can be chained inside .query(), just like .str.isnumeric().
# "is.notna()" produces a True/False vector, checking for non-missing values
# .str.isnumeric()
results["points"].isna()
results["points"].notna()
subset_nas = results.query('points.isna()')
subset_nonnas = results.query('points.notna()')
# Build the connection URL for a local PostgreSQL server
# NOTE(review): hard-coded credentials are acceptable for a class
# exercise, but real projects should load them from config/environment
url_server = URL.create(
    "postgresql",
    host = "localhost",
    database = "postgres",
    username = "postgres",
    password = "12345")
con = sa.create_engine(url_server).connect()
# Export the "results" table to SQL under the name "results_sql"
# "con" is an argument to specify the server connection
# "if_exists" is an option to replace the table if it already exists
# You can choose any name instead of "results_sql"
results.to_sql("results_sql",
               con = con,
               if_exists = "replace")
# Export "races"
races.to_sql("races_sql", con, if_exists = "replace")
102
*Example*:
circuits.to_sql("circuits_sql", con, if_exists = "replace")
77
Importing data from SQL: Use pd.read_sql()
# Extract all data from a column
example1 = pd.read_sql(text("SELECT * \
FROM results_sql;"), con)
# Extract a subset of columns
example2 = pd.read_sql(text("SELECT points \
FROM results_sql;"), con)
# Subset based on a string condition
example3 = pd.read_sql(text("SELECT * \
FROM races_sql \
WHERE name = 'Abu Dhabi Grand Prix';"), con)
*Note*:
Upper case columns
"driverId".read_sql() requires a string inside a string\"driverId\"# Select a column
example4 = pd.read_sql(text("SELECT \"driverId\" \
FROM results_sql;"), con)
# Compute an aggregate statistic
example5 = pd.read_sql(text("SELECT AVG(points) AS mean_points \
FROM results_sql \
GROUP BY \"driverId\" ;"), con)
Merge two data sets
# Merge
# Reference the column \"driverId\" with escape characters
example6 = pd.read_sql(text("SELECT * \
FROM results_sql \
LEFT JOIN races_sql \
ON results_sql.\"raceId\" = races_sql.\"raceId\" ;"), con)
# Merge a subset of columns
# Use "results_sql.*" to select all columns from the primary dataset
# Use "races_sql.date" to only select the "date" column from the secondary dataset
example7 = pd.read_sql(text("SELECT results_sql.*, races_sql.date \
FROM results_sql \
LEFT JOIN races_sql \
ON results_sql.\"raceId\" = races_sql.\"raceId\" ;"), con)
*Example*
# Example: a pd.read_sql() command that totals points by "raceId".
# (The original line had markdown text fused onto the code by the
# notebook export, which made it a syntax error.)
example8 = pd.read_sql(text("SELECT \"raceId\", SUM(points) AS sum_points \
FROM results_sql \
GROUP BY \"raceId\";"), con)
*Example*
# Example: a pd.read_sql() command with a LEFT JOIN of circuits onto races.
# (The original line had markdown text fused onto the code by the
# notebook export, which made it a syntax error.)
example9 = pd.read_sql(text("SELECT * FROM races_sql \
LEFT JOIN circuits_sql \
ON races_sql.\"circuitId\" = circuits_sql.\"circuitId\"; "),
con)
More about SQL syntax
Data can come in a wide variety of formats
Wide to long
$\quad$ 
# Wide -> long reshape: one row per (date, portfolio_type) pair.
# "id_vars" stays fixed; each column in "value_vars" becomes a row,
# labelled in "portfolio_type" with its value in "portfolio_value".
financial_long = pd.melt(
    financial,
    id_vars="date",
    value_vars=["sp500", "djia"],
    var_name="portfolio_type",
    value_name="portfolio_value",
)
Long to wide
$\quad$ 
# Long -> wide reshape: one column per portfolio_type, indexed by date.
financial_wide = pd.pivot(
    financial_long,
    index="date",
    columns="portfolio_type",
    values="portfolio_value",
)
# If you also want the index back as a regular column, append
# ".reset_index()" to the pivot call above.
*Example*
# Example: long reshape of the growth columns.
# Fixed typo in the label column name: "protfolio_type" -> "portfolio_type"
# (matching the spelling used everywhere else in the file).
growth_long = pd.melt(financial,
                      var_name = "portfolio_type",
                      value_name = "portfolio_growth",
                      id_vars = "date",
                      value_vars = ["growth_sp500", "growth_djia"])
Count frequencies
# Frequency table of each action category, sorted most common first
bills_actions["category"].value_counts()
amendment 1529 house bill 902 senate bill 514 house resolution 234 senate resolution 60 house joint resolution 22 house concurrent resolution 20 senate concurrent resolution 14 senate joint resolution 8 Name: category, dtype: int64
Subset text categories:
For this analysis we are only interested in bills. With .query() we select all entries whose "category" value is contained in list_categories.
# Keep only the rows whose category is one of the two bill types.
# Boolean-mask indexing with .isin() selects exactly the same rows as
# the original .query('category in @list_categories').
list_categories = ["house bill", "senate bill"]
bills = bills_actions[bills_actions["category"].isin(list_categories)]
# Sanity check: only the two bill categories should remain
bills["category"].value_counts()
house bill 902 senate bill 514 Name: category, dtype: int64
Data manipulation with sentences
# How many bills mention the word Senator?
# How many bills mention the word Senator?
# .str.contains gives a boolean Series; its mean is the share of True values
bool_contains = bills["action"].str.contains("Senator")
print(bool_contains.mean())
# How to replace the word "Senator" with "Custom Title"
# (returns a new Series; "bills" itself is not modified)
bills["action"].str.replace("Senator", "Custom Title")
0.3199152542372881
3 Committee on Health, Education, Labor, and Pen...
4 Committee on the Judiciary. Reported by Custom...
5 Committee on the Judiciary. Reported by Custom...
6 Committee on Commerce, Science, and Transporta...
7 Committee on Veterans' Affairs. Reported by Cu...
...
3262 Mr. Blumenauer moved to suspend the rules and ...
3263 At the conclusion of debate, the chair put the...
3264 Ms. Hill (CA) moved to suspend the rules and p...
3265 Mr. Barr moved to recommit with instructions t...
3280 Mr. Pallone moved that the Committee rise.
Name: action, Length: 1416, dtype: object
*Example*
Obtain a new dataset called "resolutions" which keeps the rows whose "category" value is one of:
["house resolution","senate resolution"]
# Share of rows whose category mentions "house resolution"
check_resolution = bills_actions["category"].str.contains("house resolution")
print(check_resolution.mean())
# Solution 1: two .str.contains() tests combined with "|" inside .query()
resolution1 = bills_actions.query(
'category.str.contains("house resolution") | category.str.contains("senate resolution")'
)
# Solution 2: membership test against a Python list referenced with "@"
list_resolutions = ["house resolution", "senate resolution"]
resolutions2 = bills_actions.query('category in @list_resolutions')
# Solution 3: same membership test with the list written inline
resolutions3 = bills_actions.query(
'category in ["house resolution","senate resolution"]')
house resolution 234 senate resolution 60 Name: category, dtype: int64
Regular expressions enable advanced searching for string data.
# Exact-match subsets by category
senate_bills = bills_actions.query('category == "senate bill"')
amendments = bills_actions.query('category == "amendment"')
# display() is an IPython/Jupyter helper (not defined in plain scripts)
display(amendments["action"])
0 S.Amdt.1274 Amendment SA 1274 proposed by Sena...
1 S.Amdt.2698 Amendment SA 2698 proposed by Sena...
2 S.Amdt.2659 Amendment SA 2659 proposed by Sena...
8 S.Amdt.2424 Amendment SA 2424 proposed by Sena...
11 S.Amdt.1275 Amendment SA 1275 proposed by Sena...
...
3298 H.Amdt.172 Amendment (A004) offered by Ms. Kus...
3299 H.Amdt.171 Amendment (A003) offered by Ms. Hou...
3300 H.Amdt.170 Amendment (A002) offered by Ms. Oma...
3301 POSTPONED PROCEEDINGS - At the conclusion of d...
3302 H.Amdt.169 Amendment (A001) offered by Mr. Esp...
Name: action, Length: 1529, dtype: object
Search Word:
We use the .str.findall() subfunction; its argument is a regular expression.
# Use a raw string (r"...") so the regex escape "\." is not treated as a
# Python string escape ("\." in a normal string raises SyntaxWarning on
# modern Python); the pattern value is unchanged.
amendments["action"].str.findall(r"Amdt\.")
0 [Amdt.]
1 [Amdt.]
2 [Amdt.]
8 [Amdt.]
11 [Amdt.]
...
3298 [Amdt.]
3299 [Amdt.]
3300 [Amdt.]
3301 []
3302 [Amdt.]
Name: action, Length: 1529, dtype: object
Wildcards
$\quad$ 
# Raw strings (r"...") keep the regex escape "\." out of Python's own
# string-escape rules; pattern values are unchanged.
# Get digits after string
example1 = amendments["action"].str.findall(r"Amdt\..")
# Get any character before string
example2 = amendments["action"].str.findall(r".Amdt\.")
# Get two characters before string
example3 = amendments["action"].str.findall(r"..Amdt\...............")
#display(example1)
#display(example2)
display(example3)
0 [S.Amdt.1274 Amendment]
1 [S.Amdt.2698 Amendment]
2 [S.Amdt.2659 Amendment]
8 [S.Amdt.2424 Amendment]
11 [S.Amdt.1275 Amendment]
...
3298 [H.Amdt.172 Amendment ]
3299 [H.Amdt.171 Amendment ]
3300 [H.Amdt.170 Amendment ]
3301 []
3302 [H.Amdt.169 Amendment ]
Name: action, Length: 1529, dtype: object
Wildcards + Quantifiers
$\quad$ 
# Raw strings (r"...") keep the regex escapes "\." and "\d" out of
# Python's own string-escape rules; pattern values are unchanged.
# Get all consecutive digits after string
example4 = amendments["action"].str.findall(r"Amdt\.\d*")
# Get all consecutive characters before string
example5 = amendments["action"].str.findall(r".*Amdt\.")
#display(example4)
display(example5)
0 [S.Amdt.]
1 [S.Amdt.]
2 [S.Amdt.]
8 [S.Amdt.]
11 [S.Amdt.]
...
3298 [H.Amdt.]
3299 [H.Amdt.]
3300 [H.Amdt.]
3301 []
3302 [H.Amdt.]
Name: action, Length: 1529, dtype: object
*Example*
# Example: on the senate_bills dataset, use .str.findall() with the raw
# pattern r"Senator \S*" -- "\S*" grabs the run of non-space characters
# after "Senator ", i.e. the senator's name.
# (The original line had markdown text fused onto the code by the
# notebook export, which made it a syntax error.)
senate_bills["action"].str.findall(r"Senator \S*")
3 [Senator Alexander]
4 [Senator Graham]
5 [Senator Graham]
6 [Senator Wicker]
7 [Senator Moran]
...
795 [Senator Johnson]
796 []
797 [Senator Hoeven]
798 []
799 [Senator Graham]
Name: action, Length: 514, dtype: object
Consider a sample with $n$ observations
$$ X = \begin{pmatrix} X_1 \\ X_2 \\ \vdots \\ X_n \end{pmatrix}.$$
We can simulate from different distributions
# Set Sample size
# These produce several common distributions
# A normal with mean ("loc") 5 and standard deviation ("scale") 5
# A chi-square with "df" degrees of freedom
# A uniform with values between -3 and 5
n = 10000
vec_normal = np.random.normal(loc = 5, scale = 5, size = n)
vec_chisqr = np.random.chisquare(df = 1, size = n)
vec_unif = np.random.uniform(low = -3,high = 5, size = n)
The sample average is defined as
$$ \overline{X} = \frac{1}{n}\sum_{i=1}^n X_i $$
print(vec_normal.mean())
# Sample means should be near the population means: 1 for the chi-square
# (df = 1) and 1 for the uniform (midpoint of [-3, 5])
print(vec_chisqr.mean())
print(vec_unif.mean())
5.006422876828413 1.0056585403833294 1.0104836009802556
Multiple plots in a row (subplot)
# Two histograms side by side: normal on the left, uniform on the right.
fig, list_subfig = plt.subplots(1, 2,figsize = (6,3))
plt.tight_layout()
panels = [(vec_normal, "Normal Distribution"),
          (vec_unif, "Uniform Distribution")]
for ax, (vec, title) in zip(list_subfig, panels):
    ax.hist(x = vec)
    ax.set_title(title)
    ax.set_xlabel("Value")
# Only the left panel carries the shared y-axis label
list_subfig[0].set_ylabel("Frequency")
Text(0.5, 3.722222222222216, 'Value')
*Exercise*
# Three histograms in one row: normal, chi-square, uniform.
fig, list_subfig = plt.subplots(1, 3, figsize = (9,2))
plt.tight_layout()
panels = [(vec_normal, "Normal Distribution"),
          (vec_chisqr, "Chi-Square Distribution"),
          (vec_unif, "Uniform Distribution")]
for ax, (vec, title) in zip(list_subfig, panels):
    ax.hist(x = vec)
    ax.set_title(title)
    ax.set_xlabel("Value")
# Only the left panel carries the shared y-axis label
list_subfig[0].set_ylabel("Frequency")
Text(0.5, -7.277777777777782, 'Value')
What happens to $\overline{X}$ if we draw different samples?
# Sampling distribution of Xbar: draw "num_simulations" independent
# samples of size "sample_size" from Uniform(-2, 2) and keep each mean.
num_simulations = 2000
sample_size = 100
vec_xbar = [np.random.uniform(low = -2, high = 2, size = sample_size).mean()
            for _ in range(num_simulations)]
plt.hist(vec_xbar)
plt.title("Distribution of Xbar with different simulated samples")
plt.ylabel("Frequency")
plt.xlabel("Values of Xbar")
plt.show()
What happens to $\overline{X}$ if we draw with different $n$?
This is called the Central Limit Theorem in Statistics
# One way is to repeat the same code chunk for each sample size.
# Each chunk regenerates the simulated sample means from scratch; a list
# comprehension replaces the pre-allocated list + index loop.
num_simulations = 2000
# Simulate with sample size one
sample_size = 1
vec_xbar = [np.random.uniform(low = -2, high = 2, size = sample_size).mean()
            for _ in range(num_simulations)]
plt.hist(vec_xbar)
plt.title("Distribution of Xbar with size 1")
plt.ylabel("Frequency")
plt.xlabel("Values of Xbar")
plt.show()
# Simulate with sample size 10
sample_size = 10
vec_xbar = [np.random.uniform(low = -2, high = 2, size = sample_size).mean()
            for _ in range(num_simulations)]
plt.hist(vec_xbar)
plt.title("Distribution of Xbar with size 10")
plt.ylabel("Frequency")
plt.xlabel("Values of Xbar")
plt.show()
# Simulate with sample size 50
sample_size = 50
vec_xbar = [np.random.uniform(low = -2, high = 2, size = sample_size).mean()
            for _ in range(num_simulations)]
plt.hist(vec_xbar)
plt.title("Distribution of Xbar with size 50")
plt.ylabel("Frequency")
plt.xlabel("Values of Xbar")
plt.show()
We can use nested loops to simplify our code.
# To evaluate different sample sizes, nest the simulation inside an outer
# loop over the sizes; the inner comprehension rebuilds the vector of
# simulated sample means each time.
num_simulations = 2000
sample_size_list = [1, 10, 50]
for sample_size in sample_size_list:
    # One simulated Xbar per iteration, "num_simulations" of them
    vec_xbar = [np.random.uniform(low = -2, high = 2, size = sample_size).mean()
                for _ in range(num_simulations)]
    plt.hist(vec_xbar)
    plt.title("Distribution of Xbar when n is " + str(sample_size))
    plt.ylabel("Frequency")
    plt.xlabel("Values of Xbar")
    plt.show()
*Exercise*
# Same experiment with chi-square (df = 1) draws instead of uniforms.
num_simulations = 2000
sample_size_list = [1, 10, 50]
for sample_size in sample_size_list:
    # One simulated Xbar per iteration, "num_simulations" of them
    vec_xbar = [np.random.chisquare(df = 1, size = sample_size).mean()
                for _ in range(num_simulations)]
    plt.hist(vec_xbar)
    plt.title("Distribution of Xbar when n is " + str(sample_size))
    plt.ylabel("Frequency")
    plt.xlabel("Values of Xbar")
    plt.show()
*Exercise*
# Same chi-square experiment drawn as three side-by-side panels.
# Fixes: removed the unused "list_index" variable, and replaced the
# hand-maintained "index" counter with enumerate().
num_simulations = 2000
sample_size_list = [1, 10, 50]
fig, list_subfig = plt.subplots(1, 3, figsize = (9, 3))
for index, sample_size in enumerate(sample_size_list):
    vec_xbar_chisqr = [None] * num_simulations
    for iteration in range(num_simulations):
        vec_chisqr = np.random.chisquare(df = 1, size = sample_size)
        vec_xbar_chisqr[iteration] = vec_chisqr.mean()
    list_subfig[index].hist(vec_xbar_chisqr)
    list_subfig[index].set_title(f"Distribution of Xbar (n = {str(sample_size)})")
    list_subfig[index].set_ylabel("Frequency")
    list_subfig[index].set_xlabel("Values of Xbar")
Let ''sample_stdv'' be the sample standard deviation of $X_i$.
# Parameters of a normal random variable
n = 10000
population_mean = 2
population_stdv = 5
# Draw the sample, reusing the parameter variables instead of repeating
# the literals, then compute the two sample moments.
X = np.random.normal(loc = population_mean, scale = population_stdv, size = n)
Xbar = X.mean()
sample_stdv = X.std()
# Both should land close to their population counterparts (2 and 5)
print(Xbar)
print(sample_stdv)
2.0480684945806265 4.988722335956281
A 95% normal confidence interval is defined by $\ldots$
# Compute new variables for the upper and lower bound
# 1.96 is the two-sided 95% critical value of the standard normal;
# sample_stdv / sqrt(n) is the standard error of the sample mean.
lower_bound = Xbar - 1.96*(sample_stdv / np.sqrt(n))
upper_bound = Xbar + 1.96*(sample_stdv / np.sqrt(n))
The following code tests the following condition:
lower_bound $\quad \le \quad $ population_mean $\quad \le \quad$ upper_bound
# Report whether the true mean landed inside the interval.
# Fixed typos in the else-branch message: stray backslash in "95\%" and
# "intervla" -> "interval".
if population_mean >= lower_bound and population_mean <= upper_bound:
    print("The population mean is in the 95% normal confidence interval!")
else:
    print("The population mean is not in the 95% confidence interval :(")
The population mean is in the 95% normal confidence interval!
*Exercise*: Test Whether This is a 95% Confidence Interval
Procedure:
Create a variable called "num_simulations" with value 1000
Create the simulation parameters "n", "population_mean", "population_stdv".
Create an empty vector called "list_test_confidenceinterval".
Create a loop. At each iteration:
Create a vector of normal random variables of size "n".
Create a variable "test_confidenceinterval", which tests:
lower_bound $\quad \le \quad $ population_mean $\quad \le \quad$ upper_bound
Append "test_confidenceinterval" to the above list
Compute the mean of "list_test_confidenceinterval"
Note: The final result should be close to 95%.
# Simulation check: a 95% CI should contain the true mean ~95% of the time.
# Fixes: the parameter was defined as "populations_stdv" but the loop read
# "population_stdv" (it only ran because an identically-valued global
# existed earlier in the file); also fixed the "interation" typo and
# replaced the if/else with a direct boolean assignment.
num_simulations = 1000
n = 100
population_mean = 2
population_stdv = 5
list_test_confidenceinterval = []
for iteration in range(num_simulations):
    vec_normal = np.random.normal(population_mean, population_stdv, n)
    Xbar = vec_normal.mean()
    sample_stdv = vec_normal.std()
    lower_bound = Xbar - 1.96*(sample_stdv / np.sqrt(n))
    upper_bound = Xbar + 1.96*(sample_stdv / np.sqrt(n))
    # True when the interval covers the true mean
    test_confidenceinterval = lower_bound <= population_mean <= upper_bound
    list_test_confidenceinterval.append(test_confidenceinterval)
# Fraction of simulations where the CI covered the truth (should be ~0.95)
np.mean(list_test_confidenceinterval)
1.0
Create an empty dataset
# Start from an empty dataframe; columns are added below
dataset = pd.DataFrame([])
Create two random variables of size ($n = 50$)
# Two independent standard-normal columns of length n
# (drawn in the same order as before: "x" first, then "e")
n = 50
for col in ["x", "e"]:
    dataset[col] = np.random.normal(loc = 0, scale = 1, size = n)
Create data from the linear model
$ y = b_0 + b_1 x + e, \qquad b_0 = 1, b_1 = 2.$
# b0 is the intercept and b1 the slope of the data-generating process
b0, b1 = 1, 2
# Column arithmetic applies elementwise across the whole dataset
dataset["y"] = b0 + b1 * dataset["x"] + dataset["e"]
Compute the theoretically best fit line
$ p = b_0 + b_1 x$
# Theoretical best-fit line: the conditional mean of y given x (no noise)
dataset["p"] = b0 + b1 * dataset["x"]
Plot the data
# Scatter the simulated data, then overlay the theoretical best-fit line;
# legend labels follow the plotting order.
for col in ["y", "p"]:
    plt.scatter(x = dataset["x"], y = dataset[col])
plt.xlabel("X Variable")
plt.ylabel("Y Variable")
plt.legend(labels = ["Data points", "Best fit line"])
plt.show()
*Example*
# Example: subset the rows with y >= 2 via .query(), then compare sizes.
# (The original first line had markdown text fused onto the code by the
# notebook export, which made it a syntax error.)
subset_above2 = dataset.query("y >= 2")
print(len(dataset))
print(len(subset_above2))
print(f"{len(subset_above2) / len(dataset) * 100} %")
50 17 34.0 %
*Example*
Define ybar and stdv_sample, then use .query() to subset the observations that satisfy
$ \qquad abs\left(y - \overline{y} \right) \le stdv\_sample $
$\quad$ HINT: Use .mean(),$\text{ }$ .std()
$\quad$ HINT: Use the globals $\ $ @ybar,$\text{ }$ @stdv_sample
# Note: abs(...) is the absolute value function
# Write your own code
# .query() understands abs(); "@" pulls Python globals into the expression
ybar = dataset["y"].mean()
stdv_sample = dataset["y"].std()
# Keep rows within one sample standard deviation of the mean of y
subset = dataset.query("abs(y - @ybar) <= @stdv_sample")
We have data on $(y,x)$ but we don't know $(b_0,b_1)$
Let's fit an OLS model
#------------------------------------------------------------------------------#
# We use the subfunction "ols()" in the library "smf"
#---- (i) The first argument is a string called "formula" with the format
#-------- "outcome ~ independent_vars"
#----(ii) the second argument is the dataset
# BUG FIX: .fit() takes the keyword "cov_type", not "cov". The original
# "cov = 'HC1'" was silently ignored, which is why the summary below
# reports "Covariance Type: nonrobust". "HC1" requests
# heteroskedasticity-robust standard errors.
# NOTE(review): this rebinds "results", shadowing the F1 results dataframe
# loaded at the top of the file -- confirm that is intended.
#-------------------------------------------------------------------------------#
model = smf.ols(formula = 'y ~ x',data = dataset)
results = model.fit(cov_type = "HC1")
# Can also run as one line
# results = smf.ols(formula = 'y ~ x',data = dataset).fit(cov_type = "HC1")
print(results.summary())
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.723
Model: OLS Adj. R-squared: 0.718
Method: Least Squares F-statistic: 125.5
Date: Sun, 26 Feb 2023 Prob (F-statistic): 5.40e-15
Time: 16:40:41 Log-Likelihood: -72.754
No. Observations: 50 AIC: 149.5
Df Residuals: 48 BIC: 153.3
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 1.0856 0.151 7.179 0.000 0.782 1.390
x 1.8988 0.169 11.203 0.000 1.558 2.240
==============================================================================
Omnibus: 1.364 Durbin-Watson: 1.868
Prob(Omnibus): 0.506 Jarque-Bera (JB): 1.148
Skew: 0.365 Prob(JB): 0.563
Kurtosis: 2.870 Cond. No. 1.20
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Compute the estimated best fit line
# ".params" holds the fitted coefficients as a pandas Series
b_list = results.params
print(b_list)
# Use positional .iloc: plain b_list[0] on a string-indexed Series is
# deprecated in pandas 2.x and removed in pandas 3.0.
dataset["p_estimated"] = b_list.iloc[0] + b_list.iloc[1] * dataset["x"]
# Note: The estimators for "b0" and "b1" are close to
# the values we used to generate the data
Intercept 1.085574 x 1.898793 dtype: float64
Plot the best fit line
# Scatter the data and the fitted values; the legend box labels the two
# colors in plotting order.
for col in ["y", "p_estimated"]:
    plt.scatter(x = dataset["x"], y = dataset[col])
plt.legend(labels = ["Data points","Estimated Predicted Model"])
plt.show()
*Example*
# Example: compare the true line "p" with the estimated line "p_estimated"
for col in ["p", "p_estimated"]:
    plt.scatter(x = dataset["x"], y = dataset[col])
plt.legend(labels = ["Our Model", "Estimated Line of Best Fit"])
plt.show()
*Example*
$\quad$ sample_error = y - p_estimated
$\quad$ fn_positive_error = lambda error: error >= 0
using .apply()
# Residual = observed y minus fitted value
dataset["sample_error"] = dataset["y"] - dataset["p_estimated"]

def fn_positive_error(error):
    """Return True when a residual is non-negative."""
    return error >= 0

# .apply() maps the function over every element of the Series
dataset["sample_error_positive_check"] = dataset["sample_error"].apply(fn_positive_error)
*Example*
error_sqr = sample_error ** 2
# Squared residuals; their mean is the mean squared error of the fit.
# (The original first line had markdown text fused onto the code by the
# notebook export.)
dataset["error_sqr"] = dataset["sample_error"] ** 2
dataset["error_sqr"].mean()
1.0749508086915205
Create an empty dataset
# Reset to a fresh empty dataframe for the multivariate examples
dataset = pd.DataFrame([])
Create three random variables of size ($n = 100$)
# Three independent standard-normal columns of length n
# (drawn in the same order as before: x, then z, then e)
n = 100
for col in ["x", "z", "e"]:
    dataset[col] = np.random.normal(loc = 0, scale = 1, size = n)
Create discrete random variable ($n = 100$)
# Categorical column: values 1/2/3 drawn with probabilities 0.2/0.2/0.6
dataset["d"] = np.random.choice(a = [1, 2, 3], size = n, p = [0.2, 0.2, 0.6])
Create data from the linear model
$ y = 2 + 5 x + e$
# We can compute formulas directly over dataset columns
# NOTE(review): the noise enters as x * e rather than the plain "+ e" the
# formula above states, which makes the errors heteroskedastic --
# presumably deliberate for the robust-SE (HC1) examples below; confirm.
dataset["y"] = 2 + 5 * dataset["x"] + dataset["x"] * dataset["e"]
Summaries for univariate regression
# Univariate fit of y on x with HC1 heteroskedasticity-robust errors
results_univariate = (smf.ols(formula = 'y ~ x', data = dataset)
                      .fit(cov_type = "HC1"))
# summary_col prints a compact regression table;
# stars = True adds significance markers
print(summary_col(results_univariate, stars = True))
========================
y
------------------------
Intercept 2.1850***
(0.1171)
x 5.0127***
(0.1245)
R-squared 0.9675
R-squared Adj. 0.9671
========================
Standard errors in
parentheses.
* p<.1, ** p<.05,
***p<.01
# Full statsmodels report for the univariate fit
print(results_univariate.summary())
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.967
Model: OLS Adj. R-squared: 0.967
Method: Least Squares F-statistic: 1622.
Date: Sun, 26 Feb 2023 Prob (F-statistic): 8.86e-63
Time: 16:44:33 Log-Likelihood: -156.10
No. Observations: 100 AIC: 316.2
Df Residuals: 98 BIC: 321.4
Df Model: 1
Covariance Type: HC1
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 2.1850 0.117 18.658 0.000 1.956 2.415
x 5.0127 0.124 40.273 0.000 4.769 5.257
==============================================================================
Omnibus: 8.870 Durbin-Watson: 1.738
Prob(Omnibus): 0.012 Jarque-Bera (JB): 9.509
Skew: 0.543 Prob(JB): 0.00861
Kurtosis: 4.050 Cond. No. 1.27
==============================================================================
Notes:
[1] Standard Errors are heteroscedasticity robust (HC1)
Summaries for multivariate regression
# Add a second regressor with "+" inside the formula string
results_multivariate = (smf.ols(formula = 'y ~ x + z', data = dataset)
                        .fit(cov_type = "HC1"))
print(summary_col(results_multivariate, stars = True))
========================
y
------------------------
Intercept 2.1733***
(0.1157)
x 5.0098***
(0.1236)
z -0.1158
(0.1143)
R-squared 0.9678
R-squared Adj. 0.9671
========================
Standard errors in
parentheses.
* p<.1, ** p<.05,
***p<.01
Summaries for multivariate regression + categories
# C(d) expands the discrete column "d" into indicator variables;
# coefficients are reported relative to the base category (T.1)
results_multivariate_category = (smf.ols(formula = 'y ~ x + C(d)',
                                         data = dataset)
                                 .fit(cov_type = "HC1"))
print(summary_col(results_multivariate_category, stars = True))
========================
y
------------------------
Intercept 1.9636***
(0.2620)
C(d)[T.2] 0.0705
(0.4091)
C(d)[T.3] 0.2954
(0.2976)
x 5.0094***
(0.1245)
R-squared 0.9678
R-squared Adj. 0.9668
========================
Standard errors in
parentheses.
* p<.1, ** p<.05,
***p<.01
Summaries for multivariate regression + interaction
# "z:x" adds the interaction term between z and x to the formula
results_multivariate_interaction = (smf.ols(formula = 'y ~ x + z + z:x',
                                            data = dataset)
                                    .fit(cov_type = "HC1"))
print(summary_col(results_multivariate_interaction, stars = True))
========================
y
------------------------
Intercept 2.1704***
(0.1166)
x 4.9888***
(0.1260)
z -0.1035
(0.1210)
z:x -0.1246
(0.1425)
R-squared 0.9682
R-squared Adj. 0.9672
========================
Standard errors in
parentheses.
* p<.1, ** p<.05,
***p<.01
Summaries for multiple columns
# All four specifications side by side in a single table
list_results = [
    results_univariate,
    results_multivariate,
    results_multivariate_category,
    results_multivariate_interaction,
]
print(summary_col(list_results, stars = True))
======================================================
y I y II y III y IIII
------------------------------------------------------
C(d)[T.2] 0.0705
(0.4091)
C(d)[T.3] 0.2954
(0.2976)
Intercept 2.1850*** 2.1733*** 1.9636*** 2.1704***
(0.1171) (0.1157) (0.2620) (0.1166)
R-squared 0.9675 0.9678 0.9678 0.9682
R-squared Adj. 0.9671 0.9671 0.9668 0.9672
x 5.0127*** 5.0098*** 5.0094*** 4.9888***
(0.1245) (0.1236) (0.1245) (0.1260)
z -0.1158 -0.1035
(0.1143) (0.1210)
z:x -0.1246
(0.1425)
======================================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01
Summaries for multiple columns (sorted + titled + stats)
# This list inputs the headings of the table
list_headings = ["Univariate",
                 "Multivariate",
                 "Categorical",
                 "Interaction"]
# This is the list of regressor names (if you want a particular order)
list_regressors = ["x",
                   "z",
                   "z:x",
                   "C(d)[T.2]",
                   "C(d)[T.3]"]
# This is a function that extracts the sample size
# Can use with other summary statistics
# "nobs" is the number of observations
compute_summary = {'N':lambda model: format(int(model.nobs))}
# Pass the variables defined above; the original re-typed identical
# literals inline, leaving "compute_summary" and "list_regressors" unused.
print(summary_col(list_results,
                  stars = True,
                  model_names = list_headings,
                  info_dict = compute_summary,
                  regressor_order = list_regressors))
==============================================================
Univariate Multivariate Categorical Interaction
--------------------------------------------------------------
x 5.0127*** 5.0098*** 5.0094*** 4.9888***
(0.1245) (0.1236) (0.1245) (0.1260)
z -0.1158 -0.1035
(0.1143) (0.1210)
z:x -0.1246
(0.1425)
C(d)[T.2] 0.0705
(0.4091)
C(d)[T.3] 0.2954
(0.2976)
Intercept 2.1850*** 2.1733*** 1.9636*** 2.1704***
(0.1171) (0.1157) (0.2620) (0.1166)
R-squared 0.9675 0.9678 0.9678 0.9682
R-squared Adj. 0.9671 0.9671 0.9668 0.9672
N 100 100 100 100
==============================================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01
Detailed table
# Detailed statsmodels report for the univariate specification
print(results_univariate.summary())
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.967
Model: OLS Adj. R-squared: 0.967
Method: Least Squares F-statistic: 1622.
Date: Sun, 26 Feb 2023 Prob (F-statistic): 8.86e-63
Time: 16:46:10 Log-Likelihood: -156.10
No. Observations: 100 AIC: 316.2
Df Residuals: 98 BIC: 321.4
Df Model: 1
Covariance Type: HC1
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 2.1850 0.117 18.658 0.000 1.956 2.415
x 5.0127 0.124 40.273 0.000 4.769 5.257
==============================================================================
Omnibus: 8.870 Durbin-Watson: 1.738
Prob(Omnibus): 0.012 Jarque-Bera (JB): 9.509
Skew: 0.543 Prob(JB): 0.00861
Kurtosis: 4.050 Cond. No. 1.27
==============================================================================
Notes:
[1] Standard Errors are heteroscedasticity robust (HC1)